spaCy/spacy/lang/zh/lex_attrs.py

import re

from ...attrs import LIKE_NUM


# Tokens that count as a number on their own (checked by exact match in
# like_num): the numerals for 0-19 in both the everyday and the formal
# "financial" character sets, plus the single-character forms 廿, 卅, 卌, 皕.
_single_num_words = [
    "〇",
    "一",
    "二",
    "三",
    "四",
    "五",
    "六",
    "七",
    "八",
    "九",
    "十",
    "十一",
    "十二",
    "十三",
    "十四",
    "十五",
    "十六",
    "十七",
    "十八",
    "十九",
    "廿",
    "卅",
    "卌",
    "皕",
    "零",
    "壹",
    "贰",
    "叁",
    "肆",
    "伍",
    "陆",
    "柒",
    "捌",
    "玖",
    "拾",
    "拾壹",
    "拾贰",
    "拾叁",
    "拾肆",
    "拾伍",
    "拾陆",
    "拾柒",
    "拾捌",
    "拾玖",
]

# Digit characters 1-9 (everyday and financial forms) that may precede a unit
# character inside a compound numeral.
_count_num_words = [
    "一",
    "二",
    "三",
    "四",
    "五",
    "六",
    "七",
    "八",
    "九",
    "壹",
    "贰",
    "叁",
    "肆",
    "伍",
    "陆",
    "柒",
    "捌",
    "玖",
]

# Unit (magnitude) characters that may follow a digit character.
_base_num_words = ["十", "百", "千", "万", "亿", "兆", "拾", "佰", "仟"]

# Compound numeral: one or more <digit><unit> pairs, optionally followed by a
# single trailing digit (e.g. 三百五十六). Compiled once at import time so the
# alternations are not re-joined on every call. It is applied with fullmatch()
# rather than match() + "$", because "$" also matches just before a trailing
# newline and would let "二十\n" through.
_compound_num_re = re.compile(
    "(?:(?:{count})(?:{base}))+(?:{count})?".format(
        count="|".join(_count_num_words),
        base="|".join(_base_num_words),
    )
)


def like_num(text):
    """Return True if ``text`` looks like a number.

    The check proceeds in this order:

    1. One leading sign character (``+``, ``-``, ``±``, ``~``) is dropped, and
       every ASCII/full-width comma and period is removed.
    2. The remainder is accepted if it consists only of digits.
    3. A simple fraction ``<digits>/<digits>`` is accepted.
    4. An exact entry of ``_single_num_words`` is accepted.
    5. A compound Chinese numeral made of strictly alternating digit and unit
       characters, with an optional final digit, is accepted.

    Note that step 5 does not cover numerals containing a 零 placeholder or
    two adjacent unit characters (e.g. 二十万); those return False.

    text (str): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Separators are stripped before any check, so "1,000" and "999.0" reduce
    # to plain digit strings.
    text = text.replace(",", "").replace(".", "").replace("，", "").replace("。", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _single_num_words:
        return True
    # The whole string must be consumed; an empty string never matches because
    # at least one <digit><unit> pair is required.
    return _compound_num_re.fullmatch(text) is not None


# Lexical attribute getters exported by this module: the LIKE_NUM attribute is
# computed by like_num.
LEX_ATTRS = {LIKE_NUM: like_num}
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
+								import re
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
+								from ...attrs import LIKE_NUM
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
+								_single_num_words = [
 								    "〇",
 								    "一",
 								    "二",
 								    "三",
 								    "四",
 								    "五",
 								    "六",
 								    "七",
 								    "八",
 								    "九",
 								    "十",
 								    "十一",
 								    "十二",
 								    "十三",
 								    "十四",
 								    "十五",
 								    "十六",
 								    "十七",
 								    "十八",
 								    "十九",
 								    "廿",
 								    "卅",
 								    "卌",
 								    "皕",
 								    "零",
 								    "壹",
 								    "贰",
 								    "叁",
 								    "肆",
 								    "伍",
 								    "陆",
 								    "柒",
 								    "捌",
 								    "玖",
 								    "拾",
 								    "拾壹",
 								    "拾贰",
 								    "拾叁",
 								    "拾肆",
 								    "拾伍",
 								    "拾陆",
 								    "拾柒",
 								    "拾捌",
-												Tidy up and auto-format

											
										
										
											2019-08-20 18:36:34 +03:00
+								    "拾玖",
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
+								]
 								_count_num_words = [
 								    "一",
 								    "二",
 								    "三",
 								    "四",
 								    "五",
 								    "六",
 								    "七",
 								    "八",
 								    "九",
 								    "壹",
 								    "贰",
 								    "叁",
 								    "肆",
 								    "伍",
 								    "陆",
 								    "柒",
 								    "捌",
-												Tidy up and auto-format

											
										
										
											2019-08-20 18:36:34 +03:00
+								    "玖",
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
+								]
-												Tidy up and auto-format

											
										
										
											2019-08-20 18:36:34 +03:00
+								_base_num_words = ["十", "百", "千", "万", "亿", "兆", "拾", "佰", "仟"]
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
 								def like_num(text):
 								    if text.startswith(("+", "-", "±", "~")):
 								        text = text[1:]
-												Tidy up and auto-format

											
										
										
											2019-08-20 18:36:34 +03:00
+								    text = text.replace(",", "").replace(".", "").replace("，", "").replace("。", "")
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
+								    if text.isdigit():
 								        return True
 								    if text.count("/") == 1:
 								        num, denom = text.split("/")
 								        if num.isdigit() and denom.isdigit():
 								            return True
 								    if text in _single_num_words:
 								        return True
-												Tidy up and auto-format

											
										
										
											2019-08-20 18:36:34 +03:00
+								    # fmt: off
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
+								    if re.match('^((' + '|'.join(_count_num_words) + '){1}'
 								                + '(' + '|'.join(_base_num_words) + '){1})+'
 								                + '(' + '|'.join(_count_num_words) + ')?$', text):
 								        return True
-												Tidy up and auto-format

											
										
										
											2019-08-20 18:36:34 +03:00
+								    # fmt: on
-												update lang/zh (#4103)

* update lang/zh

* update lang/zh

											
										
										
											2019-08-12 11:37:48 +03:00
+								    return False
 								LEX_ATTRS = {LIKE_NUM: like_num}