mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Reorganize exceptions for English and German
This commit is contained in:
parent
66c7348cda
commit
311b30ab35
|
@ -5,6 +5,25 @@ from os import path
|
|||
from ..language import Language
|
||||
from ..attrs import LANG
|
||||
from . import language_data
|
||||
from ..util import update_exc
|
||||
|
||||
from ..language_data import EMOTICONS
|
||||
from .language_data import ORTH_ONLY
|
||||
from .language_data import strings_to_exc
|
||||
from .language_data import get_time_exc
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
|
||||
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
|
||||
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
|
||||
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
|
||||
TAG_MAP = dict(language_data.TAG_MAP)
|
||||
STOP_WORDS = set(language_data.STOP_WORDS)
|
||||
|
||||
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 24 + 1)))
|
||||
|
||||
|
||||
class German(Language):
|
||||
|
@ -14,14 +33,10 @@ class German(Language):
|
|||
tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
|
||||
prefixes = tuple(language_data.TOKENIZER_PREFIXES)
|
||||
|
||||
suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
|
||||
|
||||
infixes = tuple(language_data.TOKENIZER_INFIXES)
|
||||
|
||||
tag_map = dict(language_data.TAG_MAP)
|
||||
|
||||
stop_words = set(language_data.STOP_WORDS)
|
||||
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
|
|
|
@ -3,7 +3,21 @@ from __future__ import unicode_literals
|
|||
import re
|
||||
|
||||
from ..symbols import *
|
||||
from ..language_data import EMOTICONS
|
||||
|
||||
|
||||
def strings_to_exc(orths):
    """Turn plain strings into self-mapping exception entries.

    orths: iterable of strings that should each be kept as one token.
    Returns a dict in which every string is a key whose value is a
    one-element list holding the dict `{ORTH: string}`.
    """
    exceptions = {}
    for text in orths:
        # A single sub-token whose ORTH is the full string itself.
        exceptions[text] = [{ORTH: text}]
    return exceptions
|
||||
|
||||
|
||||
def get_time_exc(hours):
    """Build exception entries for hour expressions written like "10h".

    hours: iterable of integer hours (the key is formatted with "%d").
    Returns a dict mapping "<hour>h" to a list of two sub-token dicts:
    the hour digits, followed by "h" carrying the lemma "Uhr".
    """
    exc = {}
    for hour in hours:
        # currently only supporting formats like "10h", not "10 Uhr"
        exc["%dh" % hour] = [
            # ORTH has to be the digit text, not the int itself, so that
            # the sub-token strings spell out the exception key.
            {ORTH: "%d" % hour},
            {ORTH: "h", LEMMA: "Uhr"}
        ]
    return exc
|
||||
|
||||
|
||||
PRON_LEMMA = "-PRON-"
|
||||
|
@ -655,7 +669,7 @@ TOKENIZER_EXCEPTIONS = {
|
|||
}
|
||||
|
||||
|
||||
self_map = [
|
||||
ORTH_ONLY = [
|
||||
"''",
|
||||
"\\\")",
|
||||
"<space>",
|
||||
|
|
|
@ -9,6 +9,25 @@ from ..lemmatizer import Lemmatizer
|
|||
from ..vocab import Vocab
|
||||
from ..tokenizer import Tokenizer
|
||||
from ..attrs import LANG
|
||||
from ..util import update_exc
|
||||
|
||||
from ..language_data import EMOTICONS
|
||||
from .language_data import ORTH_ONLY
|
||||
from .language_data import strings_to_exc
|
||||
from .language_data import get_time_exc
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
|
||||
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
|
||||
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
|
||||
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
|
||||
TAG_MAP = dict(language_data.TAG_MAP)
|
||||
STOP_WORDS = set(language_data.STOP_WORDS)
|
||||
|
||||
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
|
||||
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
|
||||
|
||||
|
||||
class English(Language):
|
||||
|
@ -18,14 +37,9 @@ class English(Language):
|
|||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'en'
|
||||
|
||||
tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
|
||||
|
||||
prefixes = tuple(language_data.TOKENIZER_PREFIXES)
|
||||
|
||||
suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
|
||||
|
||||
infixes = tuple(language_data.TOKENIZER_INFIXES)
|
||||
|
||||
tag_map = dict(language_data.TAG_MAP)
|
||||
|
||||
stop_words = set(language_data.STOP_WORDS)
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
|
|
|
@ -3,7 +3,35 @@ from __future__ import unicode_literals
|
|||
import re
|
||||
|
||||
from ..symbols import *
|
||||
from ..language_data import EMOTICONS
|
||||
|
||||
|
||||
def strings_to_exc(orths):
    """Turn plain strings into self-mapping exception entries.

    orths: iterable of strings that should each be kept as one token.
    Returns a dict in which every string is a key whose value is a
    one-element list holding the dict `{ORTH: string}`.
    """
    exceptions = {}
    for text in orths:
        # A single sub-token whose ORTH is the full string itself.
        exceptions[text] = [{ORTH: text}]
    return exceptions
|
||||
|
||||
|
||||
def get_time_exc(hours):
    """Build exception entries for clock times such as "10a.m." or "10pm".

    hours: iterable of integer hours (the keys are formatted with "%d").
    Returns a dict with four entries per hour -- "<hour>a.m.",
    "<hour>p.m.", "<hour>am" and "<hour>pm" -- each mapped to a list of
    two sub-token dicts: the hour digits and the period marker. The
    undotted markers "am"/"pm" carry the dotted form as their lemma.
    """
    exc = {}
    for hour in hours:
        # ORTH has to be the digit text, not the int itself, so that the
        # sub-token strings spell out the exception key.
        hour_text = "%d" % hour

        exc["%da.m." % hour] = [
            {ORTH: hour_text},
            {ORTH: "a.m."}
        ]

        exc["%dp.m." % hour] = [
            {ORTH: hour_text},
            {ORTH: "p.m."}
        ]

        exc["%dam" % hour] = [
            {ORTH: hour_text},
            {ORTH: "am", LEMMA: "a.m."}
        ]

        exc["%dpm" % hour] = [
            {ORTH: hour_text},
            {ORTH: "pm", LEMMA: "p.m."}
        ]
    return exc
|
||||
|
||||
|
||||
PRON_LEMMA = "-PRON-"
|
||||
|
@ -2121,7 +2149,7 @@ TOKENIZER_EXCEPTIONS = {
|
|||
}
|
||||
|
||||
|
||||
self_map = [
|
||||
ORTH_ONLY = [
|
||||
"''",
|
||||
"\")",
|
||||
"a.",
|
||||
|
@ -2185,11 +2213,6 @@ self_map = [
|
|||
"z."
|
||||
]
|
||||
|
||||
for orths in [self_map, EMOTICONS]:
|
||||
overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(orths))
|
||||
assert not overlap, overlap
|
||||
TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths})
|
||||
|
||||
|
||||
TOKENIZER_PREFIXES = r'''
|
||||
,
|
||||
|
|
Loading…
Reference in New Issue
Block a user