Add comments clarifying matcher logic for '*'

This commit is contained in:
greg 2018-01-22 10:03:12 -05:00
parent 7072b395c9
commit 490bc82c27

View File

@ -361,7 +361,8 @@ cdef class Matcher:
if action == PANIC: if action == PANIC:
raise Exception("Error selecting action in matcher") raise Exception("Error selecting action in matcher")
# ADVANCE_PLUS acts like REPEAT, but also pushes a partial that
# acts like an ADVANCE_ZERO
if action == ADVANCE_PLUS: if action == ADVANCE_PLUS:
state.second += 1 state.second += 1
partials.push_back(state) partials.push_back(state)
@ -372,10 +373,13 @@ cdef class Matcher:
if action == ADVANCE: if action == ADVANCE:
state.second += 1 state.second += 1
# Check for partial matches that are at the same spec in the same pattern
# Keep the longer of the matches
# This ensures that there are never more than 2 partials for every spec
# in a pattern (one of which gets pruned in this step)
overlap=False overlap=False
for i in range(q): for i in range(q):
if ent_id != get_pattern_key(partials[i].second):
continue
if state.second == partials[i].second and state.first < partials[i].first: if state.second == partials[i].second and state.first < partials[i].first:
partials[i] = state partials[i] = state
j = i j = i
@ -385,26 +389,12 @@ cdef class Matcher:
continue continue
overlap=False overlap=False
for i in range(q): for i in range(q):
if ent_id != get_pattern_key(partials[i].second):
continue
if state.second == partials[i].second: if state.second == partials[i].second:
overlap = True overlap = True
break break
if overlap: if overlap:
continue continue
# overlap=False
# for i in range(q):
# if state.second == partials[i].second:
# if state.first < partials[i].first:
# partials[i] = state
# j = i-1
# else:
# overlap=True
# break
# if overlap:
# continue
if action == REPEAT: if action == REPEAT:
# Leave the state in the queue, and advance to next slot # Leave the state in the queue, and advance to next slot
@ -425,10 +415,9 @@ cdef class Matcher:
# ent_id = state.second[1].attrs[0].value # ent_id = state.second[1].attrs[0].value
# ent_id = get_pattern_key(state.second) # ent_id = get_pattern_key(state.second)
label = state.second[1].attrs[1].value label = state.second[1].attrs[1].value
# matches.append((ent_id, start, end))
# Check that this match doesn't overlap with an earlier match. # Check that this match doesn't overlap with an earlier match.
# Only overwrite an earlier match if it is a substring of this # Only overwrite an earlier match if it is a substring of this
# match. # match (i.e. it starts after this match starts).
if ent_id not in matches_dict: if ent_id not in matches_dict:
matches_dict[ent_id] = (start,end,len(matches)) matches_dict[ent_id] = (start,end,len(matches))
@ -454,23 +443,8 @@ cdef class Matcher:
action = get_action(pattern, token) action = get_action(pattern, token)
if action == PANIC: if action == PANIC:
raise Exception("Error selecting action in matcher") raise Exception("Error selecting action in matcher")
# while acton == ADVANCE_ZERO:
# pattern += 1
# action = get_action(pattern,token)
# if action == PANIC:
# raise Exception("Error selecting action in matcher")
while action in (ADVANCE_PLUS,ADVANCE_ZERO): while action in (ADVANCE_PLUS,ADVANCE_ZERO):
if action == ADVANCE_PLUS: if action == ADVANCE_PLUS:
# j=0
# overlap = False
# for j in range(q):
# if pattern == partials[j].second:
# overlap = True
# break
# if overlap:
# pattern += 1
# action = get_action(pattern, token)
# continue
state.first = token_i state.first = token_i
state.second = pattern state.second = pattern
partials.push_back(state) partials.push_back(state)
@ -483,8 +457,6 @@ cdef class Matcher:
j=0 j=0
overlap = False overlap = False
for j in range(q): for j in range(q):
if ent_id == get_pattern_key(partials[j].second):
continue
if pattern == partials[j].second: if pattern == partials[j].second:
overlap = True overlap = True
break break
@ -508,7 +480,6 @@ cdef class Matcher:
start = token_i start = token_i
end = token_i+1 if action == ACCEPT else token_i end = token_i+1 if action == ACCEPT else token_i
ent_id = pattern[1].attrs[0].value ent_id = pattern[1].attrs[0].value
# ent_id = get_pattern_key(state.second)
label = pattern[1].attrs[1].value label = pattern[1].attrs[1].value
if ent_id not in matches_dict: if ent_id not in matches_dict:
matches_dict[ent_id] = (start,end,len(matches)) matches_dict[ent_id] = (start,end,len(matches))
@ -531,9 +502,7 @@ cdef class Matcher:
start = state.first start = state.first
end = len(doc) end = len(doc)
ent_id = state.second.attrs[0].value ent_id = state.second.attrs[0].value
# ent_id = get_pattern_key(state.second)
label = state.second.attrs[1].value label = state.second.attrs[1].value
# matches.append((ent_id, start, end))
if ent_id not in matches_dict: if ent_id not in matches_dict:
matches_dict[ent_id] = (start,end,len(matches)) matches_dict[ent_id] = (start,end,len(matches))
matches.append((ent_id,start,end)) matches.append((ent_id,start,end))