mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
53 lines
1.2 KiB
Python
53 lines
1.2 KiB
Python
# coding: utf8
|
||
from __future__ import unicode_literals
|
||
from ...symbols import ORTH, LEMMA
|
||
|
||
_exc = {}
|
||
|
||
for raw, lemma in [
|
||
("a-a", "a-o"),
|
||
("a-e", "a-o"),
|
||
("a-o", "a-o"),
|
||
("a-i", "a-o"),
|
||
("co-a", "co-o"),
|
||
("co-e", "co-o"),
|
||
("co-i", "co-o"),
|
||
("co-o", "co-o"),
|
||
("da-a", "da-o"),
|
||
("da-e", "da-o"),
|
||
("da-i", "da-o"),
|
||
("da-o", "da-o"),
|
||
("pe-a", "pe-o"),
|
||
("pe-e", "pe-o"),
|
||
("pe-i", "pe-o"),
|
||
("pe-o", "pe-o"),
|
||
]:
|
||
for orth in [raw, raw.capitalize()]:
|
||
_exc[orth] = [{ORTH: orth, LEMMA: lemma}]
|
||
|
||
# Prefix + prepositions with à (e.g. "sott'a-o")
|
||
|
||
for prep, prep_lemma in [
|
||
("a-a", "a-o"),
|
||
("a-e", "a-o"),
|
||
("a-o", "a-o"),
|
||
("a-i", "a-o"),
|
||
]:
|
||
for prefix, prefix_lemma in [
|
||
("sott'", "sotta"),
|
||
("sott’", "sotta"),
|
||
("contr'", "contra"),
|
||
("contr’", "contra"),
|
||
("ch'", "che"),
|
||
("ch’", "che"),
|
||
("s'", "se"),
|
||
("s’", "se"),
|
||
]:
|
||
for prefix_orth in [prefix, prefix.capitalize()]:
|
||
_exc[prefix_orth + prep] = [
|
||
{ORTH: prefix_orth, LEMMA: prefix_lemma},
|
||
{ORTH: prep, LEMMA: prep_lemma},
|
||
]
|
||
|
||
TOKENIZER_EXCEPTIONS = _exc
|