Mirror of https://github.com/explosion/spaCy.git
Synced 2025-11-04 01:48:04 +03:00
	Improve Ligurian tokenization
parent 3e30b5bef6
commit 8a469f06a4
spacy/lang/lij/__init__.py

@@ -1,5 +1,5 @@
 from ...language import BaseDefaults, Language
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
@@ -7,6 +7,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class LigurianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
 
 
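The new prefixes attribute is what routes the prefix rules added below into the tokenizer. A minimal sketch of how the wiring can be checked, assuming a spaCy build that includes this commit (compile_prefix_regex is the helper spaCy itself uses when building the tokenizer):

    from spacy.lang.lij import Ligurian
    from spacy.util import compile_prefix_regex

    # Defaults.prefixes is compiled into the tokenizer's prefix_search pattern
    prefix_re = compile_prefix_regex(Ligurian.Defaults.prefixes)

    # the year-shorthand rule from punctuation.py now matches as a prefix
    assert prefix_re.search("'90") is not None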
spacy/lang/lij/examples.py

@@ -9,6 +9,6 @@ Example sentences to test spaCy and its language models.
 sentences = [
     "Sciusciâ e sciorbî no se peu.",
     "Graçie di çetroin, che me son arrivæ.",
-    "Vegnime apreuvo, che ve fasso pescâ di òmmi.",
+    "Vegnîme apreuvo, che ve fasso pescâ di òmmi.",
     "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
 ]
spacy/lang/lij/punctuation.py

@@ -1,11 +1,23 @@
+from ..punctuation import (
+    TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES,
+    TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES,
+)
 from ..char_classes import ALPHA
-from ..punctuation import TOKENIZER_INFIXES
 
-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+ELISION = "'’"
 
 
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+_prefixes = [
+    r"['’‘][0-9]{2}",  # shorthand for years
+    r"[0-9]+°(?![cfkCFK])",  # use of degree symbol as ordinal indicator
+    r"[{el}‘]nn?[{el}]?".format(el=ELISION),  # elided forms of "un(na)"
+] + BASE_TOKENIZER_PREFIXES
+
+
+_infixes = BASE_TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
 ]
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
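The reworked infix splits a token after letter + apostrophe whenever a letter, digit or double quote follows (the old rule only split before letters). A standalone check of just the pattern, assuming only that spaCy's shared ALPHA character class is importable:

    import re

    from spacy.lang.char_classes import ALPHA

    ELISION = "'’"
    # same pattern as above: split after letter + apostrophe,
    # before a letter, digit or double quote
    infix = r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION)

    assert re.search(infix, "l'ægua")       # split point: "l'" | "ægua"
    assert re.search(infix, "unn'agoggia")  # split point: "unn'" | "agoggia"
    assert not re.search(infix, "sott'")    # nothing follows the apostrophe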
spacy/lang/lij/stop_words.py

@@ -1,38 +1,40 @@
 STOP_WORDS = set(
     """
-a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
+a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuo apreuvo ascì atra atre atri atro avanti avei aveiva
 
-bella belle belli bello ben
+bell' bell’ bella belle belli bello ben
 
-ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
+ch' ch’ che chì chi ciù co-a co-e co-i co-o comm' comm’ comme con contr' contr’ contra cösa coscì cöse
 
-d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
+d' d’ da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo drent' drent’ dento
 
-é e ê ea ean emmo en ëse
+é à e ê ea ean emmo en ëse
 
 fin fiña
 
-gh' ghe guæei
+gh' gh’ ghe guæi
 
-i î in insemme int' inta inte inti into
+i î in insemme int' int’ inta inte inti into
 
-l' lê lì lô
+l' l’ lê lì liatre liatri lô loiatre loiatri
 
-m' ma manco me megio meno mezo mi
+m' m’ ma mai manco me megio meno meza meze mezi mezo mi
 
-na n' ne ni ninte nisciun nisciuña no
+n' n’ na ne nì niatre niatri ninte nisciun nisciuña no noiatre noiatri
 
 o ò ô oua
 
 parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio
 
-quæ quand' quande quarche quella quelle quelli quello
+quæ quand' quand’ quande quarche quarcösa quell' quell’ quella quelle quelli quello
 
-s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
+s' s’ sce scê scì scî scià sciâ sciô sciù se segge seu sò solo son sott' sott’ sotta sta stæta stæte stæti stæto ste sti sto
 
-tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
+tant' tant’ tanta tante tanti tanto te teu tò ti torna tra tròppo tutt' tutt’ tutta tutte tutti tutto
 
-un uña unn' unna
+un uña unn' unn’ unna
 
 voî voscià
 
 za zu
 """.split()
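The enlarged list is exposed through the usual is_stop lexical attribute; a quick check, assuming a spaCy build that includes this commit:

    import spacy

    nlp = spacy.blank("lij")
    assert nlp.vocab["delongo"].is_stop   # existing entry
    assert nlp.vocab["quarcösa"].is_stop  # added in this commit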
spacy/lang/lij/tokenizer_exceptions.py

@@ -1,49 +1,67 @@
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 
+# Returns capitalized variants, all caps variants and with curly apostrophe
+def _variants(orth, exc):
+    yield orth, exc
+    yield orth.capitalize(), [
+        {ORTH: e[ORTH].capitalize() if i == 0 else e[ORTH], NORM: e.get(NORM, e[ORTH])}
+        for i, e in enumerate(exc)
+    ]
+    yield orth.upper(), [
+        {ORTH: e[ORTH].upper(), NORM: e.get(NORM, e[ORTH])} for e in exc
+    ]
+    if "'" in orth:
+        yield from _variants(
+            orth.replace("'", "’"),
+            [
+                {ORTH: e[ORTH].replace("'", "’"), NORM: e.get(NORM, e[ORTH])}
+                for e in exc
+            ],
+        )
+
+
 _exc = {}
 
-for raw in [
-    "a-e",
-    "a-o",
-    "a-i",
-    "a-a",
-    "co-a",
-    "co-e",
-    "co-i",
-    "co-o",
-    "da-a",
-    "da-e",
-    "da-i",
-    "da-o",
-    "pe-a",
-    "pe-e",
-    "pe-i",
-    "pe-o",
-]:
-    for orth in [raw, raw.capitalize()]:
-        _exc[orth] = [{ORTH: orth}]
+# Compound prepositions
+
+# Compounds with "inte" and "de" aren't split as they can be ambiguous
+# Format: (compound form, isolated form, determiners it goes with)
+_preps = [
+    ("a-", "à", "oaie"),
+    ("co-", "con", "oaie"),
+    ("da-", "da", "oaie"),
+    ("pe-", "pe", "oaie"),
+    ("pi-", "pe", "a"),  # colloquialism
+    ("de-", "de", "oaie"),  # incorrect, but occasionally seen
+    ("ne-", "inte", "oaie"),  # incorrect, but occasionally seen
+]
+for prep_, prep, dets in _preps:
+    for det in dets:
+        for orth, exc in _variants(
+            prep_ + det, [{ORTH: prep_, NORM: prep}, {ORTH: det}]
+        ):
+            _exc[orth] = exc
 
+# Prefix + prepositions with à (e.g. "sott'a-o")
 for prep in [
     "a-a",
     "a-e",
     "a-o",
     "a-i",
 ]:
     for prefix in [
         "sott'",
         "sott’",
         "contr'",
         "contr’",
         "ch'",
         "ch’",
         "s'",
         "s’",
     ]:
         for prefix_orth in [prefix, prefix.capitalize()]:
             _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
 
+# Units
+
+for u in "cfkCFK":
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+# Other exceptions
+
+_other_exc = {
+    "'n'": [{ORTH: "'n'", NORM: "unna"}],
+    "‘n'": [{ORTH: "‘n'", NORM: "unna"}],
+    "'n": [{ORTH: "'n", NORM: "un"}],
+    "‘n": [{ORTH: "‘n", NORM: "un"}],
+    "tou": [{ORTH: "t", NORM: "te"}, {ORTH: "ou", NORM: "ô"}],
+}
+for orth_, exc_ in _other_exc.items():
+    for orth, exc in _variants(orth_, exc_):
+        _exc[orth] = exc
+
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
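_variants fans a single lowercase exception out into its capitalized, all-caps and curly-apostrophe forms, so each rule only needs to be written once. A sketch of what it yields for one compound preposition; importing the private helper assumes a source checkout that includes this commit:

    from spacy.lang.lij.tokenizer_exceptions import _variants
    from spacy.symbols import NORM, ORTH

    for orth, exc in _variants("co-a", [{ORTH: "co-", NORM: "con"}, {ORTH: "a"}]):
        print(orth, [(e[ORTH], e.get(NORM)) for e in exc])
    # prints the "co-a", "Co-a" and "CO-A" variants; keys containing a straight
    # apostrophe would additionally yield curly-apostrophe forms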
spacy/tests/conftest.py

@@ -282,6 +282,11 @@ def lg_tokenizer():
     return get_lang_class("lg")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def lij_tokenizer():
+    return get_lang_class("lij")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt")().tokenizer
spacy/tests/lang/lij/__init__.py  (new file, 0 lines)
spacy/tests/lang/lij/test_exceptions.py  (new file, 13 lines)

@@ -0,0 +1,13 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens,expected_norms",
+    [("a-e", ["a-", "e"], ["à", "e"]), ("co-i", ["co-", "i"], ["con", "i"])],
+)
+def test_prepositions(lij_tokenizer, text, expected_tokens, expected_norms):
+    """Test that compound prepositions are split correctly."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
+    assert [t.norm_ for t in tokens] == expected_norms
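The same behaviour can be reproduced outside the test suite; a usage sketch mirroring the test data above, assuming a spaCy install that includes this commit:

    import spacy

    nlp = spacy.blank("lij")
    doc = nlp("co-i")
    print([t.text for t in doc])   # ['co-', 'i'], per the test above
    print([t.norm_ for t in doc])  # ['con', 'i']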
spacy/tests/lang/lij/test_prefix_suffix_infix.py  (new file, 24 lines)

@@ -0,0 +1,24 @@
+import pytest
+
+
+@pytest.mark.parametrize("text", ["'90", "’90", "‘90"])
+def test_lij_tokenizer_handles_year_elision(lij_tokenizer, text):
+    """Test that elided years (e.g. '90 for 1990) are not split."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("10°C", ["10", "°C"])])
+def test_lij_tokenizer_handles_degrees(lij_tokenizer, text, expected_tokens):
+    """Test that in degree units the degree symbol isn't split from the unit."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("'n'atra", ["'n'", "atra"])])
+def test_lij_tokenizer_handles_left_elision(lij_tokenizer, text, expected_tokens):
+    """Test that left-eliding expressions are not split from their left apostrophe."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list