mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 21:57:15 +03:00
112 lines
2.2 KiB
Python
112 lines
2.2 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ..symbols import *
|
|
from ..language_data import PRON_LEMMA
|
|
|
|
# Exception table exported by this module; it starts empty and is filled from
# CONTRACTIONS further down.
TOKENIZER_EXCEPTIONS = {}

# Contractions: fused form -> list of per-token attribute dicts.
CONTRACTIONS = {}

# Word lists that the loops below combine with the "d" and "n" prefixes.
personal_pronoun = ("ele", "ela", "eles", "elas")
demonstrative_pronouns = (
    "este", "esta", "estes", "estas", "isto",
    "esse", "essa", "esses", "essas", "isso",
    "aquele", "aquela", "aqueles", "aquelas", "aquilo",
)
undefined_pronouns = ("outro", "outra", "outros", "outras")
# Only used for the "d" prefix (e.g. "daqui"); the "n" loop skips these.
adverbs = ("aqui", "aí", "ali", "além")
|
|
|
|
# Contractions with "de": every fused form "d<word>" (e.g. "dele", "daqui")
# is mapped to two token entries -- "d", normalised to "de", followed by the
# word itself.
for word in (personal_pronoun + demonstrative_pronouns
             + undefined_pronouns + adverbs):
    CONTRACTIONS["d" + word] = [{ORTH: "d", NORM: "de"}, {ORTH: word}]
|
|
|
|
# Contractions with "em": every fused form "n<word>" (e.g. "nele", "neste")
# is mapped to two token entries -- "n", normalised to "em", followed by the
# word itself. Unlike the "d" forms, the adverbs are not included here.
for word in (personal_pronoun + demonstrative_pronouns
             + undefined_pronouns):
    CONTRACTIONS["n" + word] = [{ORTH: "n", NORM: "em"}, {ORTH: word}]
|
|
|
|
# Contractions built on "a" do not follow the plain prefix + word pattern of
# the loops above, so each one is listed explicitly.
#
# Plain "à" is intentionally absent: it is a single character and cannot be
# split into two tokens.
CONTRACTIONS.update({
    "às": [{ORTH: "à", NORM: "a"}, {ORTH: "s", NORM: "as"}],
    "ao": [{ORTH: "a"}, {ORTH: "o"}],
    "aos": [{ORTH: "a"}, {ORTH: "os"}],
    "àquele": [{ORTH: "à", NORM: "a"}, {ORTH: "quele", NORM: "aquele"}],
    "àquela": [{ORTH: "à", NORM: "a"}, {ORTH: "quela", NORM: "aquela"}],
    "àqueles": [{ORTH: "à", NORM: "a"}, {ORTH: "queles", NORM: "aqueles"}],
    "àquelas": [{ORTH: "à", NORM: "a"}, {ORTH: "quelas", NORM: "aquelas"}],
    "àquilo": [{ORTH: "à", NORM: "a"}, {ORTH: "quilo", NORM: "aquilo"}],
    "aonde": [{ORTH: "a"}, {ORTH: "onde"}],
})

# Copy every contraction collected so far into the exported exception table.
TOKENIZER_EXCEPTIONS.update(CONTRACTIONS)
|
|
|
|
# Abbreviations that need only a single ORTH token each (no NORM or other
# attributes are attached to them here).
ORTH_ONLY = [
    "Adm.", "Dr.", "e.g.", "E.g.", "E.G.",
    "Gen.", "Gov.", "i.e.", "I.e.", "I.E.",
    "Jr.", "Ltd.", "p.m.", "Ph.D.", "Rep.",
    "Rev.", "Sen.", "Sr.", "Sra.", "vs.",
]
|