mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Hyphen infix (#5770)
* infix split on hyphen when preceded by number
* clean up
* skip Ukrainian test instead of xfail
This commit is contained in:
		
							parent
							
								
									ec819fc311
								
							
						
					
					
						commit
						1b2ec94382
					
				| 
						 | 
				
			
			@ -3,6 +3,7 @@ from .stop_words import STOP_WORDS
 | 
			
		|||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
| 
						 | 
				
			
			@ -20,6 +21,7 @@ class EnglishDefaults(Language.Defaults):
 | 
			
		|||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    single_orth_variants = [
 | 
			
		||||
        {"tags": ["NFP"], "variants": ["…", "..."]},
 | 
			
		||||
        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										19
									
								
								spacy/lang/en/punctuation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								spacy/lang/en/punctuation.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,19 @@
 | 
			
		|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
 | 
			
		||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 | 
			
		||||
 | 
			
		||||
_infixes = (
 | 
			
		||||
    LIST_ELLIPSES
 | 
			
		||||
    + LIST_ICONS
 | 
			
		||||
    + [
 | 
			
		||||
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
 | 
			
		||||
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
 | 
			
		||||
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
 | 
			
		||||
        ),
 | 
			
		||||
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
 | 
			
		||||
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
 | 
			
		||||
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
 | 
			
		||||
    ]
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
TOKENIZER_INFIXES = _infixes
 | 
			
		||||
| 
						 | 
				
			
			@ -26,9 +26,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 | 
			
		|||
        ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
 | 
			
		||||
        ("""'Me too!', Mr. P. Delaware cried. """, 11),
 | 
			
		||||
        ("They ran about 10km.", 6),
 | 
			
		||||
        pytest.param(
 | 
			
		||||
            "But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
 | 
			
		||||
        ),
 | 
			
		||||
        ("But then the 6,000-year ice age came...", 10),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -16,8 +16,6 @@ import pytest
 | 
			
		|||
        "grand'hamien",
 | 
			
		||||
        "Châteauneuf-la-Forêt",
 | 
			
		||||
        "Château-Guibert",
 | 
			
		||||
        "11-septembre",
 | 
			
		||||
        "11-Septembre",
 | 
			
		||||
        "refox-trottâmes",
 | 
			
		||||
        # u"K-POP",
 | 
			
		||||
        # u"K-Pop",
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user