mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-11 16:52:21 +03:00
* Add a few abbreviations, to get tests to pass
This commit is contained in:
parent
db191361ee
commit
7959141d36
|
@ -77,7 +77,7 @@ cdef class English(Language):
|
||||||
i += 1
|
i += 1
|
||||||
return i
|
return i
|
||||||
|
|
||||||
|
abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
|
||||||
cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
|
cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
|
||||||
cdef unicode char_i = characters[i]
|
cdef unicode char_i = characters[i]
|
||||||
cdef unicode char_i1 = characters[i+1]
|
cdef unicode char_i1 = characters[i+1]
|
||||||
|
@ -91,8 +91,9 @@ cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
|
||||||
# Don't count commas as punct if the next char is a number
|
# Don't count commas as punct if the next char is a number
|
||||||
if characters[i] == "," and i < (length - 1) and char_i1.isdigit():
|
if characters[i] == "," and i < (length - 1) and char_i1.isdigit():
|
||||||
return False
|
return False
|
||||||
# Don't count periods as punct if the next char is not whitespace
|
if characters[i] == "." and i < (length - 1):
|
||||||
if characters[i] == ".":
|
return False
|
||||||
|
if characters[i] == "." and characters[:i] in abbreviations:
|
||||||
return False
|
return False
|
||||||
return not char_i.isalnum()
|
return not char_i.isalnum()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user