mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-15 06:09:01 +03:00
71884d0942
Co-authored-by: explosion-bot <explosion-bot@users.noreply.github.com>
35 lines
741 B
Python
35 lines
741 B
Python
from ...attrs import LIKE_NUM
|
|
import re
|
|
|
|
# Roman numeral matcher, cf. Goyvaerts/Levithan 2009. Matching is
# case-insensitive, and C, X and I may each repeat up to four times, so
# additive spellings such as "IIII" are accepted alongside "IV". The leading
# lookahead demands at least one numeral letter; without it the pattern would
# also match the empty string, because every group is optional.
roman_numerals_compile = re.compile(
    r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)

# Cardinal number words from one to ten, stored in lower case. One, two and
# three are listed in each of their gendered forms.
_num_words = {
    "unus",
    "una",
    "unum",
    "duo",
    "duae",
    "tres",
    "tria",
    "quattuor",
    "quinque",
    "sex",
    "septem",
    "octo",
    "novem",
    "decem",
}

# Ordinal number words for first, second and third, stored in lower case in
# their masculine, feminine and neuter forms.
_ordinal_words = {
    "primus",
    "prima",
    "primum",
    "secundus",
    "secunda",
    "secundum",
    "tertius",
    "tertia",
    "tertium",
}
|
|
|
|
|
|
def like_num(text):
    """Return True if ``text`` looks like a number, otherwise False.

    A text is number-like when any of the following holds:

    * it consists only of digit characters (``str.isdigit``);
    * it is matched by ``roman_numerals_compile``;
    * its lower-cased form is in ``_num_words`` or ``_ordinal_words``.

    Args:
        text: The string to test.

    Returns:
        bool: Whether the text resembles a number.
    """
    # Digit strings and Roman numerals are tested on the text as given; the
    # Roman numeral pattern carries its own case-insensitivity flag.
    if text.isdigit() or roman_numerals_compile.match(text):
        return True
    # The word sets hold lower-case forms only, so normalise before lookup.
    lowered = text.lower()
    return lowered in _num_words or lowered in _ordinal_words
|
|
|
|
|
|
# Associates the LIKE_NUM attribute with this module's like_num predicate.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}
|