Mirror of https://github.com/explosion/spaCy.git (synced 2025-05-08 09:43:41 +03:00)

Reorganize exceptions for English and German

commit 311b30ab35, parent 66c7348cda
@@ -5,6 +5,25 @@ from os import path
 from ..language import Language
 from ..attrs import LANG
 from . import language_data
+from ..util import update_exc
+
+from ..language_data import EMOTICONS
+from .language_data import ORTH_ONLY
+from .language_data import strings_to_exc
+from .language_data import get_time_exc
+
+
+TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
+TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
+TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
+TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
+TAG_MAP = dict(language_data.TAG_MAP)
+STOP_WORDS = set(language_data.STOP_WORDS)
+
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 24 + 1)))
 
 
 class German(Language):
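Note: `update_exc` is imported from `..util` but its body is not part of this commit. Judging by the inline loop that the last hunk below removes from the English language data, it presumably asserts that no existing key is overwritten and then merges the additions in place. A minimal sketch under that assumption:

    def update_exc(exc, additions):
        # assumed behaviour, mirroring the loop removed in the last hunk:
        # refuse to overwrite an existing exception, then merge in place
        overlap = set(exc).intersection(set(additions))
        assert not overlap, overlap
        exc.update(additions)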
@@ -15,13 +34,9 @@ class German(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'de'
 
-        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
-        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
-        infixes = tuple(language_data.TOKENIZER_INFIXES)
-
-        tag_map = dict(language_data.TAG_MAP)
-
-        stop_words = set(language_data.STOP_WORDS)
-
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        prefixes = TOKENIZER_PREFIXES
+        suffixes = TOKENIZER_SUFFIXES
+        infixes = TOKENIZER_INFIXES
+        tag_map = TAG_MAP
+        stop_words = STOP_WORDS
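Because the merging now happens at module level, the combined exceptions are visible on the Defaults class as soon as the package is imported. An illustrative check, assuming the package path is spacy.de as in spaCy at the time:

    from spacy.de import German

    # "1h" .. "24h" come from get_time_exc(range(1, 24 + 1))
    assert "10h" in German.Defaults.tokenizer_exceptions
    assert "24h" in German.Defaults.tokenizer_exceptions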
@@ -3,7 +3,21 @@ from __future__ import unicode_literals
 import re
 
 from ..symbols import *
+from ..language_data import EMOTICONS
+
+def strings_to_exc(orths):
+    return {orth: [{ORTH: orth}] for orth in orths}
+
+def get_time_exc(hours):
+    exc = {}
+    for hour in hours:
+        # currently only supporting formats like "10h", not "10 Uhr"
+        exc["%dh" % hour] = [
+            {ORTH: hour},
+            {ORTH: "h", LEMMA: "Uhr"}
+        ]
+    return exc
 
 
 PRON_LEMMA = "-PRON-"
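For illustration (not part of the commit), the two helpers added above return plain exception dicts keyed by the surface string:

    strings_to_exc(["<space>", ":)"])
    # {"<space>": [{ORTH: "<space>"}], ":)": [{ORTH: ":)"}]}

    get_time_exc(range(1, 3))
    # {"1h": [{ORTH: 1}, {ORTH: "h", LEMMA: "Uhr"}],
    #  "2h": [{ORTH: 2}, {ORTH: "h", LEMMA: "Uhr"}]}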
@@ -655,7 +669,7 @@ TOKENIZER_EXCEPTIONS = {
 }
 
 
-self_map = [
+ORTH_ONLY = [
     "''",
     "\\\")",
     "<space>",
@@ -9,6 +9,25 @@ from ..lemmatizer import Lemmatizer
 from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..attrs import LANG
+from ..util import update_exc
+
+from ..language_data import EMOTICONS
+from .language_data import ORTH_ONLY
+from .language_data import strings_to_exc
+from .language_data import get_time_exc
+
+
+TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
+TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
+TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
+TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
+TAG_MAP = dict(language_data.TAG_MAP)
+STOP_WORDS = set(language_data.STOP_WORDS)
+
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 
 
 class English(Language):
@@ -18,14 +37,9 @@ class English(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'en'
 
-        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
-
-        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
-        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
-        infixes = tuple(language_data.TOKENIZER_INFIXES)
-
-        tag_map = dict(language_data.TAG_MAP)
-
-        stop_words = set(language_data.STOP_WORDS)
+        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+        prefixes = TOKENIZER_PREFIXES
+        suffixes = TOKENIZER_SUFFIXES
+        infixes = TOKENIZER_INFIXES
+        tag_map = TAG_MAP
+        stop_words = STOP_WORDS
@@ -3,7 +3,35 @@ from __future__ import unicode_literals
 import re
 
 from ..symbols import *
+from ..language_data import EMOTICONS
+
+def strings_to_exc(orths):
+    return {orth: [{ORTH: orth}] for orth in orths}
+
+def get_time_exc(hours):
+    exc = {}
+    for hour in hours:
+        exc["%da.m." % hour] = [
+            {ORTH: hour},
+            {ORTH: "a.m."}
+        ]
+
+        exc["%dp.m." % hour] = [
+            {ORTH: hour},
+            {ORTH: "p.m."}
+        ]
+
+        exc["%dam" % hour] = [
+            {ORTH: hour},
+            {ORTH: "am", LEMMA: "a.m."}
+        ]
+
+        exc["%dpm" % hour] = [
+            {ORTH: hour},
+            {ORTH: "pm", LEMMA: "p.m."}
+        ]
+    return exc
 
 
 PRON_LEMMA = "-PRON-"
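Unlike the German helper, the English `get_time_exc` emits four variants per hour ("a.m.", "p.m.", "am", "pm"), which is presumably why the English __init__ hunk above passes `range(1, 12 + 1)` rather than the 24-hour range used for German. For illustration:

    get_time_exc(range(12, 13))
    # {"12a.m.": [{ORTH: 12}, {ORTH: "a.m."}],
    #  "12p.m.": [{ORTH: 12}, {ORTH: "p.m."}],
    #  "12am":   [{ORTH: 12}, {ORTH: "am", LEMMA: "a.m."}],
    #  "12pm":   [{ORTH: 12}, {ORTH: "pm", LEMMA: "p.m."}]}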
@@ -2121,7 +2149,7 @@ TOKENIZER_EXCEPTIONS = {
 }
 
 
-self_map = [
+ORTH_ONLY = [
     "''",
     "\")",
     "a.",
@@ -2185,11 +2213,6 @@ self_map = [
     "z."
 ]
 
-
-for orths in [self_map, EMOTICONS]:
-    overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(orths))
-    assert not overlap, overlap
-    TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths})
 
 TOKENIZER_PREFIXES = r'''
 ,
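The loop removed here is what the new module-level calls replace: `strings_to_exc` builds the same one-token entries, and `update_exc` (sketched earlier) performs the same overlap assertion and in-place update, now at package import time instead of inside the language-data module. Roughly:

    # old, inside the language-data module
    for orths in [self_map, EMOTICONS]:
        overlap = set(TOKENIZER_EXCEPTIONS.keys()).intersection(set(orths))
        assert not overlap, overlap
        TOKENIZER_EXCEPTIONS.update({orth: [{ORTH: orth}] for orth in orths})

    # new, in the package __init__ (self_map is now exported as ORTH_ONLY)
    update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
    update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))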