mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
93 lines
3.0 KiB
Python
93 lines
3.0 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ..norm_exceptions import BASE_NORMS
|
|
from ...attrs import NORM, LIKE_NUM
|
|
|
|
|
|
# fmt: off
|
|
_stem_suffixes = [
|
|
["ो", "े", "ू", "ु", "ी", "ि", "ा"],
|
|
["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
|
|
["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
|
|
["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
|
|
["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"]
|
|
]
|
|
# fmt: on
|
|
|
|
# reference 1: https://en.wikipedia.org/wiki/Indian_numbering_system
|
|
# reference 2: https://blogs.transparent.com/hindi/hindi-numbers-1-100/
|
|
|
|
_num_words = [
|
|
"शून्य",
|
|
"एक",
|
|
"दो",
|
|
"तीन",
|
|
"चार",
|
|
"पांच",
|
|
"छह",
|
|
"सात",
|
|
"आठ",
|
|
"नौ",
|
|
"दस",
|
|
"ग्यारह",
|
|
"बारह",
|
|
"तेरह",
|
|
"चौदह",
|
|
"पंद्रह",
|
|
"सोलह",
|
|
"सत्रह",
|
|
"अठारह",
|
|
"उन्नीस",
|
|
"बीस",
|
|
"तीस",
|
|
"चालीस",
|
|
"पचास",
|
|
"साठ",
|
|
"सत्तर",
|
|
"अस्सी",
|
|
"नब्बे",
|
|
"सौ",
|
|
"हज़ार",
|
|
"लाख",
|
|
"करोड़",
|
|
"अरब",
|
|
"खरब",
|
|
]
|
|
|
|
|
|
def norm(string):
    """Return the norm for a token: a base exception or a suffix-stripped stem.

    Tokens listed in BASE_NORMS map to their stored replacement. Otherwise the
    suffix groups in _stem_suffixes are tried from the longest suffixes to the
    shortest, and the first matching suffix is removed. A suffix is only
    removed when the token is strictly longer than it, so the result is never
    empty. Tokens with no matching suffix are returned unchanged.

    string (unicode): The token text.
    RETURNS (unicode): The normalised form of the token.
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    # http://research.variancia.com/hindi_stemmer/
    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    for suffix_group in reversed(_stem_suffixes):
        # All suffixes within a group share the same length.
        length = len(suffix_group[0])
        if len(string) <= length:
            # The token is too short for this group, but shorter suffixes in
            # the remaining groups may still apply, so keep going. Stopping
            # here would leave every short token unstemmed.
            continue
        if string.endswith(tuple(suffix_group)):
            return string[:-length]
    return string
|
|
|
|
|
|
def like_num(text):
    """Check whether a token looks like a number.

    A single leading sign ("+", "-", "±" or "~") is ignored, and commas and
    periods are removed before testing, so grouped or decimal figures such as
    "1,000" or "3.5" count as numbers. Simple fractions of the form
    digits/digits and the words listed in _num_words are accepted as well.

    text (unicode): The token text.
    RETURNS (bool): True if the token resembles a number, False otherwise.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Strip the separators themselves. Removing ", " (comma plus space) would
    # leave figures like "1,000" intact, and they would then fail isdigit().
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text.lower() in _num_words:
        return True
    return False
|
|
|
|
|
|
# Maps the NORM and LIKE_NUM attributes to the functions in this module that
# compute them.
LEX_ATTRS = {
    NORM: norm,
    LIKE_NUM: like_num,
}
|