# spaCy/spacy/lang/lij/tokenizer_exceptions.py

from ...symbols import ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS

# Special-case table: maps a surface string to the list of token attribute
# dicts it should be tokenized into.
_exc = {}

# Hyphenated contractions that are kept as ONE token. Each form is registered
# twice: as written and with its first letter capitalized.
_exc.update(
    (form, [{ORTH: form}])
    for contraction in (
        "a-e",
        "a-o",
        "a-i",
        "a-a",
        "co-a",
        "co-e",
        "co-i",
        "co-o",
        "da-a",
        "da-e",
        "da-i",
        "da-o",
        "pe-a",
        "pe-e",
        "pe-i",
        "pe-o",
    )
    for form in (contraction, contraction.capitalize())
)

# Elided prefix glued to an "a-" contraction (e.g. "sott'a-o"): split into
# TWO tokens, the prefix and the contraction. Every prefix is listed with both
# the straight (') and the typographic (’) apostrophe, and is registered in
# lowercase as well as with a capitalized first letter.
_exc.update(
    (head + contraction, [{ORTH: head}, {ORTH: contraction}])
    for contraction in ("a-a", "a-e", "a-o", "a-i")
    for elided in (
        "sott'",
        "sott’",
        "contr'",
        "contr’",
        "ch'",
        "ch’",
        "s'",
        "s’",
    )
    for head in (elided, elided.capitalize())
)

# Combine the shared base exceptions with the entries collected in _exc.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-23 00:09:01 +03:00
+								from ...symbols import ORTH
-												Tidy up and move noun_chunks, token_match, url_match

											
										
										
											2020-07-22 23:18:46 +03:00
+								from ...util import update_exc
-												isort all the things

											
										
										
											2023-06-26 12:41:03 +03:00
+								from ..tokenizer_exceptions import BASE_EXCEPTIONS
-												Add Ligurian language

											
										
										
											2020-03-20 07:20:17 +03:00
 								_exc = {}
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-23 00:09:01 +03:00
+								for raw in [
 								    "a-e",
 								    "a-o",
 								    "a-i",
 								    "a-a",
 								    "co-a",
 								    "co-e",
 								    "co-i",
 								    "co-o",
 								    "da-a",
 								    "da-e",
 								    "da-i",
 								    "da-o",
 								    "pe-a",
 								    "pe-e",
 								    "pe-i",
 								    "pe-o",
-												Add Ligurian language

											
										
										
											2020-03-20 07:20:17 +03:00
+								]:
 								    for orth in [raw, raw.capitalize()]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-23 00:09:01 +03:00
+								        _exc[orth] = [{ORTH: orth}]
-												Add Ligurian language

											
										
										
											2020-03-20 07:20:17 +03:00
 								# Prefix + prepositions with à (e.g. "sott'a-o")
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-23 00:09:01 +03:00
+								for prep in [
 								    "a-a",
 								    "a-e",
 								    "a-o",
 								    "a-i",
-												Add Ligurian language

											
										
										
											2020-03-20 07:20:17 +03:00
+								]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-23 00:09:01 +03:00
+								    for prefix in [
 								        "sott'",
 								        "sott’",
 								        "contr'",
 								        "contr’",
 								        "ch'",
 								        "ch’",
 								        "s'",
 								        "s’",
-												Add Ligurian language

											
										
										
											2020-03-20 07:20:17 +03:00
+								    ]:
 								        for prefix_orth in [prefix, prefix.capitalize()]:
-												Remove POS, TAG and LEMMA from tokenizer exceptions

											
										
										
											2020-07-23 00:09:01 +03:00
+								            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
-												Add Ligurian language

											
										
										
											2020-03-20 07:20:17 +03:00
-												Tidy up and move noun_chunks, token_match, url_match

											
										
										
											2020-07-22 23:18:46 +03:00
+								TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)