* Make suffixes file use full-power regex, so that we can handle periods properly

This commit is contained in:
Matthew Honnibal 2014-12-09 19:04:27 +11:00
parent accdbe989b
commit b962fe73d7
2 changed files with 14 additions and 15 deletions

View File

@ -1,13 +1,13 @@
,
"
)
]
}
*
!
?
\"
\)
\]
\}
\*
\!
\?
%
$
\$
>
:
;
@ -16,6 +16,8 @@ $
''
's
'S
..
...
....
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9])\.
(?<=[0-9])km

View File

@ -31,10 +31,7 @@ def read_prefix(data_dir):
def read_suffix(data_dir):
# Build the suffix pattern: read the 'suffix' file under data_dir and join its
# non-blank lines into one alternation, each alternative anchored with '$'.
# The pattern is returned as a string; it is not compiled here.
# NOTE(review): this listing has no indentation and no +/- markers. It looks
# like a rendered diff hunk in which the old and new bodies are interleaved --
# confirm against the repository before treating it as a single function body.
with utf8open(path.join(data_dir, 'suffix')) as file_:
# One entry per line; blank and whitespace-only lines are filtered out below.
entries = file_.read().split('\n')
# Presumably the pre-commit version (removed line): every entry goes through
# re.escape, so entries are matched as literal text only.
expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
# TODO: Fix this hack!
# Two lookbehind alternatives appended by hand: a period preceded by [a-z0-9],
# and 'km' preceded by a digit. Presumably hard-coded here because escaped
# file entries cannot express a lookbehind -- verify.
expression += r'|(?<=[a-z0-9])\.$'
expression += r'|(?<=[0-9])km$'
# Presumably the post-commit version (added line): entries are used verbatim
# as regex fragments, so any metacharacter must be escaped in the file itself.
# As listed, this assignment replaces the value built by the lines above.
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
return expression