mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 20:51:30 +03:00
* Make suffixes file use full-power regex, so that we can handle periods properly
This commit is contained in:
parent
accdbe989b
commit
b962fe73d7
|
@ -1,13 +1,13 @@
|
|||
,
|
||||
"
|
||||
)
|
||||
]
|
||||
}
|
||||
*
|
||||
!
|
||||
?
|
||||
\"
|
||||
\)
|
||||
\]
|
||||
\}
|
||||
\*
|
||||
\!
|
||||
\?
|
||||
%
|
||||
$
|
||||
\$
|
||||
>
|
||||
:
|
||||
;
|
||||
|
@ -16,6 +16,8 @@ $
|
|||
''
|
||||
's
|
||||
'S
|
||||
..
|
||||
...
|
||||
....
|
||||
\.\.
|
||||
\.\.\.
|
||||
\.\.\.\.
|
||||
(?<=[a-z0-9])\.
|
||||
(?<=[0-9])km
|
||||
|
|
|
@ -31,10 +31,7 @@ def read_prefix(data_dir):
|
|||
def read_suffix(data_dir):
    """Build the suffix-matching regex from the ``suffix`` file in *data_dir*.

    The file is read in full and split on newlines. Every line that is not
    blank is used verbatim as a regular-expression fragment, anchored to the
    end of the string with ``$``; the anchored fragments are then joined
    with ``|``.

    Returns the combined pattern as a string (it is not compiled here).
    """
    with utf8open(path.join(data_dir, 'suffix')) as file_:
        entries = file_.read().split('\n')
    # Entries are intentionally NOT passed through re.escape: the data file
    # holds full regexes (escaped literals such as ``\.\.\.`` as well as
    # lookbehinds such as ``(?<=[0-9])km``), so special cases live in the
    # file rather than being appended to the expression in code.
    return '|'.join(piece + '$' for piece in entries if piece.strip())
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user