mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	* Ignore prefix in suffix matches
Ignore the currently matched prefix when looking for suffix matches in
the tokenizer. Otherwise a lookbehind in the suffix pattern may match
incorrectly due to the presence of the prefix in the token string.
* Move °[cfkCFK]. to a tokenizer exception
* Adjust exceptions for same tokenization as v3.1
* Also update test accordingly
* Continue to split . after °CFK if ° is not a prefix
* Exclude new ° exceptions for pl
* Switch back to default tokenization of "° C ."
* Revert "Exclude new ° exceptions for pl"
This reverts commit 952013a5b4.
* Add exceptions for °C for hu
		
	
			
		
			
				
	
	
		
			259 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			259 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import re
 | |
| 
 | |
| from .char_classes import ALPHA_LOWER
 | |
| from ..symbols import ORTH, NORM
 | |
| 
 | |
| 
 | |
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
# A few mods to this regex to account for use cases represented in test_urls
#
# The pattern is assembled from adjacent raw-string fragments and is anchored
# with "^" and "$", so it only accepts a string that is a URL from its first
# to its last character.
URL_PATTERN = (
    # fmt: off
    r"^"
    # protocol identifier (mods: make optional and expand schemes)
    # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
    r"(?:(?:[\w\+\-\.]{2,})://)?"
    # mailto:user or user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    # MH: Do we really need this? Seems excessive, and seems to have caused
    # Issue #957
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host & domain names
    # mods: match is case-sensitive, so include [A-Z]
    r"(?:"  # noqa: E131
      r"(?:"  # noqa: E131
        r"[A-Za-z0-9\u00a1-\uffff]"  # noqa: E131
        r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
      r")?"
      r"[A-Za-z0-9\u00a1-\uffff]\."
    r")+"
    # TLD identifier
    # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
    # strings like "lower.Upper", which can be split on "." by infixes in some
    # languages
    r"(?:[" + ALPHA_LOWER + "]{2,63})"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:[/?#]\S*)?"
    r"$"
    # fmt: on
).strip()
 | |
| 
 | |
# Bound ``match`` method of the compiled URL pattern: call it with a string to
# get a match object, or None when the string is not accepted by URL_PATTERN.
# The "(?u)" inline flag is kept as in the original; for str patterns in
# Python 3 Unicode matching is already the default.
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
 | |
| 
 | |
| 
 | |
# Maps an exact string to the list of token-attribute dicts registered for it.
BASE_EXCEPTIONS = {}


# Whitespace and dash entries. Each one is keyed by its own ORTH value and
# maps to a one-element list holding the same dict.
for exc_data in [
    {ORTH: " "},
    {ORTH: "\t"},
    {ORTH: "\\t"},  # literal backslash followed by "t", not a tab character
    {ORTH: "\n"},
    {ORTH: "\\n"},  # literal backslash followed by "n", not a newline
    {ORTH: "\u2014"},  # em dash
    {ORTH: "\u00a0", NORM: "  "},  # no-break space, NORM set to two spaces
]:
    BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
 | |
| 
 | |
| 
 | |
# Strings registered as a single entry whose ORTH is the string itself:
# a handful of special cases, followed by every letter a-z plus ä, ö, ü
# with a trailing period ("a.", "b.", ..., "ü."), in that order.
for orth in [
    "'",
    '\\")',
    "<space>",
    "''",
    "C++",
    *(f"{letter}." for letter in "abcdefghijklmnopqrstuvwxyzäöü"),
]:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
 | |
| 
 | |
| 
 | |
# Set of emoticon strings. The raw triple-quoted literal holds one emoticon
# per line (raw so that the backslash in the shrug emoticon is kept as-is);
# split() breaks it on whitespace, which also discards the blank lines.
emoticons = set(
    r"""
:)
:-)
:))
:-))
:)))
:-)))
(:
(-:
=)
(=
:]
:-]
[:
[-:
[=
=]
:o)
(o:
:}
:-}
8)
8-)
(-8
;)
;-)
(;
(-;
:(
:-(
:((
:-((
:(((
:-(((
):
)-:
=(
>:(
:')
:'-)
:'(
:'-(
:/
:-/
=/
=|
:|
:-|
]=
=[
:1
:P
:-P
:p
:-p
:O
:-O
:o
:-o
:0
:-0
:()
>:o
:*
:-*
:3
:-3
=3
:>
:->
:X
:-X
:x
:-x
:D
:-D
;D
;-D
=D
xD
XD
xDD
XDD
8D
8-D

^_^
^__^
^___^
>.<
>.>
<.<
._.
;_;
-_-
-__-
v.v
V.V
v_v
V_V
o_o
o_O
O_o
O_O
0_o
o_0
0_0
o.O
O.o
O.O
o.o
0.0
o.0
0.o
@_@
<3
<33
<333
</3
(^_^)
(-_-)
(._.)
(>_<)
(*_*)
(¬_¬)
ಠ_ಠ
ಠ︵ಠ
(ಠ_ಠ)
¯\(ツ)/¯
(╯°□°)╯︵┻━┻
><(((*>
""".split()
)
 | |
| 
 | |
| 
 | |
# Register every emoticon as an entry consisting of exactly one dict whose
# ORTH is the emoticon string itself.
for orth in emoticons:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
 | |
| 
 | |
| 
 | |
# Moved from a suffix setting due to #9155 removing prefixes from consideration
# for lookbehinds.
# For each unit letter, the string "°<letter>." is registered as three parts:
# the degree sign, the letter, and the period.
for u in "cfkCFK":
    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: u}, {ORTH: "."}]
 |