Merge pull request #1 from jeannefukumaru/added-indonesian-tag-map

Added Indonesian tag map
This commit is contained in:
jeannefukumaru 2019-04-03 23:05:05 +08:00 committed by GitHub
commit 99e04c4ce2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 0 deletions

View File

@ -8,6 +8,7 @@ from .norm_exceptions import NORM_EXCEPTIONS
from .lemmatizer import LOOKUP
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -30,6 +31,7 @@ class IndonesianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP
tag_map = TAG_MAP
class Indonesian(Language):

34
spacy/lang/id/tag_map.py Normal file
View File

@ -0,0 +1,34 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PRON, AUX, SCONJ
# Indonesian tag map: pairs every fine-grained tag string with the
# coarse-grained POS symbol stored under the POS attribute for that tag.
# The tag inventory itself is explained in
# https://www.aclweb.org/anthology/Y12-1014
TAG_MAP = {
    # Each entry gets its own {POS: ...} dict, so no value is shared
    # between tags.
    fine_tag: {POS: coarse_pos}
    for fine_tag, coarse_pos in (
        ("NSD", NOUN),
        ("Z", PUNCT),
        ("VSA", VERB),
        ("CC-", NUM),
        ("R", ADP),
        ("D", ADV),
        ("ASP", ADJ),
        ("S", SCONJ),
        ("VSP", VERB),
        ("H", CCONJ),
        ("F", X),
        ("B", DET),
        ("CO-", NUM),
        ("G", ADV),
        ("PS3", PRON),
        ("W", ADV),
        ("O", AUX),
        ("PP1", PRON),
        ("ASS", ADJ),
        ("PS1", PRON),
        ("APP", ADJ),
        ("CD-", NUM),
        ("VPA", VERB),
        ("VPP", VERB),
    )
}