mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Merge pull request #1425 from explosion/feature/hindi-tokenizer
💫 Basic Hindi tokenization support
This commit is contained in:
commit
f0d577e460
|
@ -33,7 +33,7 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
|
|||
# These expressions contain various unicode variations, including characters
|
||||
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
|
||||
# conflicts, spaCy's base tokenizer should handle all of those by default
|
||||
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ ·'
|
||||
_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · ।'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
|
||||
_hyphens = '- – — -- --- —— ~'
|
||||
|
||||
|
|
24
spacy/lang/hi/__init__.py
Normal file
24
spacy/lang/hi/__init__.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
|
||||
|
||||
class HindiDefaults(Language.Defaults):
    """Hindi-specific defaults built on the shared Language defaults."""
    stop_words = STOP_WORDS

    # Start from a copy so the base class's getter mapping is not mutated,
    # then layer the Hindi getters and the fixed language code on top.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters.update({LANG: lambda text: 'hi'})
|
||||
|
||||
|
||||
class Hindi(Language):
    """Language subclass for Hindi, registered under the code 'hi'."""
    Defaults = HindiDefaults
    lang = 'hi'
|
||||
|
||||
|
||||
# Names exported by `from <this module> import *`: only the Hindi class.
__all__ = ['Hindi']
|
38
spacy/lang/hi/lex_attrs.py
Normal file
38
spacy/lang/hi/lex_attrs.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...attrs import NORM
|
||||
from ...util import add_lookups
|
||||
|
||||
|
||||
_stem_suffixes = [
|
||||
["ो","े","ू","ु","ी","ि","ा"],
|
||||
["कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें"],
|
||||
["ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं"],
|
||||
["ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां"],
|
||||
["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]
|
||||
]
|
||||
|
||||
|
||||
def norm(string):
    """Return the normalised form of a Hindi token.

    Base exceptions (e.g. punctuation or currency symbols) listed in
    BASE_NORMS take priority. Otherwise the longest matching suffix from
    _stem_suffixes is stripped and the remaining stem is returned.

    string (unicode): The token text.
    RETURNS (unicode): The BASE_NORMS entry, the stem, or the unchanged
        text if neither applies.
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
    # http://research.variancia.com/hindi_stemmer/
    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
    for suffix_group in reversed(_stem_suffixes):
        # All suffixes within one group share the same length.
        length = len(suffix_group[0])
        if len(string) <= length:
            # The word is too short for this group (stripping would leave
            # nothing), but shorter suffixes may still match. Skip the group
            # instead of aborting, otherwise short words are never stemmed.
            continue
        if string.endswith(tuple(suffix_group)):
            return string[:-length]
    return string
|
||||
|
||||
|
||||
# Lexical attribute getters provided by this module: NORM is computed by norm.
LEX_ATTRS = {NORM: norm}
|
177
spacy/lang/hi/stop_words.py
Normal file
177
spacy/lang/hi/stop_words.py
Normal file
|
@ -0,0 +1,177 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
|
||||
|
||||
# One stop word per line; the literal is split on whitespace, so it must
# contain nothing but the words themselves.
STOP_WORDS = set("""
अत
अपना
अपनी
अपने
अभी
अंदर
आदि
आप
इत्यादि
इन
इनका
इन्हीं
इन्हें
इन्हों
इस
इसका
इसकी
इसके
इसमें
इसी
इसे
उन
उनका
उनकी
उनके
उनको
उन्हीं
उन्हें
उन्हों
उस
उसके
उसी
उसे
एक
एवं
एस
ऐसे
और
कई
कर
करता
करते
करना
करने
करें
कहते
कहा
का
काफ़ी
कि
कितना
किन्हें
किन्हों
किया
किर
किस
किसी
किसे
की
कुछ
कुल
के
को
कोई
कौन
कौनसा
गया
घर
जब
जहाँ
जा
जितना
जिन
जिन्हें
जिन्हों
जिस
जिसे
जीधर
जैसा
जैसे
जो
तक
तब
तरह
तिन
तिन्हें
तिन्हों
तिस
तिसे
तो
था
थी
थे
दबारा
दिया
दुसरा
दूसरे
दो
द्वारा
न
नके
नहीं
ना
निहायत
नीचे
ने
पर
पहले
पूरा
पे
फिर
बनी
बही
बहुत
बाद
बाला
बिलकुल
भी
भीतर
मगर
मानो
मे
में
यदि
यह
यहाँ
यही
या
यिह
ये
रखें
रहा
रहे
ऱ्वासा
लिए
लिये
लेकिन
व
वग़ैरह
वर्ग
वह
वहाँ
वहीं
वाले
वुह
वे
सकता
सकते
सबसे
सभी
साथ
साबुत
साभ
सारा
से
सो
संग
ही
हुआ
हुई
हुए
है
हैं
हो
होता
होती
होते
होना
होने
""".split())
|
|
@ -82,6 +82,7 @@
|
|||
"pl": "Polish",
|
||||
"he": "Hebrew",
|
||||
"bn": "Bengali",
|
||||
"hi": "Hindi",
|
||||
"id": "Indonesian",
|
||||
"th": "Thai",
|
||||
"zh": "Chinese",
|
||||
|
|
Loading…
Reference in New Issue
Block a user