mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Move prefix and suffix detection for URL_PATTERN Move prefix and suffix detection for `URL_PATTERN` into the tokenizer. Remove associated lookahead and lookbehind from `URL_PATTERN`. Fix tokenization for Hungarian given new modified handling of prefixes and suffixes. * Match a wider range of URI schemes
		
			
				
	
	
		
			56 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			56 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
 | 
						|
from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 | 
						|
 | 
						|
 | 
						|
# removing ° from the special icons to keep e.g. 99° as one token
 | 
						|
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 | 
						|
 | 
						|
_currency = r"\$¢£€¥฿"
 | 
						|
_quotes = CONCAT_QUOTES.replace("'", "")
 | 
						|
_units = UNITS.replace("%", "")
 | 
						|
 | 
						|
_prefixes = (
 | 
						|
    LIST_PUNCT
 | 
						|
    + LIST_ELLIPSES
 | 
						|
    + LIST_QUOTES
 | 
						|
    + [_concat_icons]
 | 
						|
    + [r"[,.:](?=[{a}])".format(a=ALPHA)]
 | 
						|
)
 | 
						|
 | 
						|
_suffixes = (
 | 
						|
    LIST_PUNCT
 | 
						|
    + LIST_ELLIPSES
 | 
						|
    + LIST_QUOTES
 | 
						|
    + [_concat_icons]
 | 
						|
    + [
 | 
						|
        r"(?<=[0-9])\+",
 | 
						|
        r"(?<=°[FfCcKk])\.",
 | 
						|
        r"(?<=[0-9])(?:[{c}])".format(c=_currency),
 | 
						|
        r"(?<=[0-9])(?:{u})".format(u=_units),
 | 
						|
        r"(?<=[{al}{e}{q}(?:{c})])\.".format(
 | 
						|
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
 | 
						|
        ),
 | 
						|
        r"(?<=[{al})])-e".format(al=ALPHA_LOWER),
 | 
						|
    ]
 | 
						|
)
 | 
						|
 | 
						|
_infixes = (
 | 
						|
    LIST_ELLIPSES
 | 
						|
    + [_concat_icons]
 | 
						|
    + [
 | 
						|
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
 | 
						|
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
 | 
						|
        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
 | 
						|
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
 | 
						|
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
 | 
						|
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
 | 
						|
    ]
 | 
						|
)
 | 
						|
 | 
						|
TOKENIZER_PREFIXES = _prefixes
 | 
						|
TOKENIZER_SUFFIXES = _suffixes
 | 
						|
TOKENIZER_INFIXES = _infixes
 |