Merge pull request #1425 from explosion/feature/hindi-tokenizer

💫 Basic Hindi tokenization support
This commit is contained in:
Ines Montani 2017-10-18 13:34:52 +02:00 committed by GitHub
commit f0d577e460
5 changed files with 241 additions and 1 deletions

View File

@ -33,7 +33,7 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351). Unless there are cross-language
# conflicts, spaCy's base tokenizer should handle all of those by default.
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ·'
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ·'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉'
_hyphens = '- — -- --- —— ~'

24
spacy/lang/hi/__init__.py Normal file
View File

@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
class HindiDefaults(Language.Defaults):
    """Language-data defaults for Hindi."""
    # Start from the shared base defaults, layer the Hindi lexical
    # attribute getters on top, then pin the language ID to 'hi'.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters.update({LANG: lambda text: 'hi'})
    stop_words = STOP_WORDS


class Hindi(Language):
    """The Hindi language class."""
    lang = 'hi'
    Defaults = HindiDefaults


__all__ = ['Hindi']

View File

@ -0,0 +1,38 @@
# coding: utf8
from __future__ import unicode_literals
from ..norm_exceptions import BASE_NORMS
from ...attrs import NORM
from ...util import add_lookups
# Suffix groups ordered by suffix length (shortest first); tried longest-first
# below so the most specific suffix wins.
# NOTE(review): several entries render as empty strings or bare matras here —
# the Devanagari appears garbled by extraction; verify against the upstream
# source before relying on the exact strings.
_stem_suffixes = [
    ["","","","","","ि",""],
    ["कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें"],
    ["ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं"],
    ["ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां"],
    ["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]
]


def norm(string):
    """Return the normalised form of `string`.

    Base exceptions (punctuation, currency symbols, ...) are mapped through
    BASE_NORMS; otherwise a lightweight Hindi stemmer strips the longest
    matching suffix. Adapted from:
    http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    http://research.variancia.com/hindi_stemmer/
    https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # Try suffix groups from longest suffixes to shortest. The original code
    # broke out of the whole loop when the string was too short for the
    # current group (skipping all shorter groups) and used len(group[0]) as
    # the strip length for every suffix in the group — wrong for groups with
    # mixed-length entries, and catastrophic for empty entries, since
    # endswith('') is always True and string[:-0] == ''. Use each suffix's
    # own length, skip empty suffixes, and require a non-empty stem.
    for suffix_group in reversed(_stem_suffixes):
        for suffix in suffix_group:
            if suffix and len(string) > len(suffix) and string.endswith(suffix):
                return string[:-len(suffix)]
    return string


LEX_ATTRS = {
    NORM: norm
}

177
spacy/lang/hi/stop_words.py Normal file
View File

@ -0,0 +1,177 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
STOP_WORDS = set("""
अत
अपन
अपन
अपन
अभ
दर
आदि
आप
इति
इन
इनक
इन
इन
इन
इस
इसक
इसक
इसक
इसम
इस
इस
उन
उनक
उनक
उनक
उनक
उन
उन
उन
उस
उसक
उस
उस
एक
एव
एस
ऐस
और
कई
कर
करत
करत
करन
करन
कर
कहत
कह
ि
ितन
ि
ि
ि
ि
ि
ि
ि
नस
गय
घर
जब
जह
ितन
ि
ि
ि
ि
ि
धर
तक
तब
तरह
ि
ि
ि
ि
ि
दब
ि
सर
सर
नक
नह
ियत
पर
पहल
ि
बन
बह
बह
िलक
तर
मगर
यदि
यह
यह
यह
ि
रख
रह
रह
ि
ि
ि
वग़रह
वर
वह
वह
वह
सकत
सकत
सबस
सभ
""".split())

View File

@ -82,6 +82,7 @@
"pl": "Polish",
"he": "Hebrew",
"bn": "Bengali",
"hi": "Hindi",
"id": "Indonesian",
"th": "Thai",
"zh": "Chinese",