Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-03 21:24:11 +03:00)
* Make suffixes file use full-power regex, so that we can handle periods properly
This commit is contained in:
Parent: accdbe989b
Commit: b962fe73d7
|
@ -1,13 +1,13 @@
|
||||||
,
|
,
|
||||||
"
|
\"
|
||||||
)
|
\)
|
||||||
]
|
\]
|
||||||
}
|
\}
|
||||||
*
|
\*
|
||||||
!
|
\!
|
||||||
?
|
\?
|
||||||
%
|
%
|
||||||
$
|
\$
|
||||||
>
|
>
|
||||||
:
|
:
|
||||||
;
|
;
|
||||||
|
@ -16,6 +16,8 @@ $
|
||||||
''
|
''
|
||||||
's
|
's
|
||||||
'S
|
'S
|
||||||
..
|
\.\.
|
||||||
...
|
\.\.\.
|
||||||
....
|
\.\.\.\.
|
||||||
|
(?<=[a-z0-9])\.
|
||||||
|
(?<=[0-9])km
|
||||||
|
|
|
@ -31,10 +31,7 @@ def read_prefix(data_dir):
|
||||||
def read_suffix(data_dir):
|
def read_suffix(data_dir):
|
||||||
with utf8open(path.join(data_dir, 'suffix')) as file_:
|
with utf8open(path.join(data_dir, 'suffix')) as file_:
|
||||||
entries = file_.read().split('\n')
|
entries = file_.read().split('\n')
|
||||||
expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
|
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
|
||||||
# TODO: Fix this hack!
|
|
||||||
expression += r'|(?<=[a-z0-9])\.$'
|
|
||||||
expression += r'|(?<=[0-9])km$'
|
|
||||||
return expression
|
return expression
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user