mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			250 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			250 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import re
 | 
						|
 | 
						|
from .char_classes import ALPHA_LOWER
 | 
						|
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 | 
						|
 | 
						|
 | 
						|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
 | 
						|
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
 | 
						|
# A few mods to this regex to account for use cases represented in test_urls
 | 
						|
URL_PATTERN = (
 | 
						|
    # fmt: off
 | 
						|
    r"^"
 | 
						|
    # protocol identifier (mods: make optional and expand schemes)
 | 
						|
    # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
 | 
						|
    r"(?:(?:[\w\+\-\.]{2,})://)?"
 | 
						|
    # mailto:user or user:pass authentication
 | 
						|
    r"(?:\S+(?::\S*)?@)?"
 | 
						|
    r"(?:"
 | 
						|
    # IP address exclusion
 | 
						|
    # private & local networks
 | 
						|
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
 | 
						|
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
 | 
						|
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
 | 
						|
    # IP address dotted notation octets
 | 
						|
    # excludes loopback network 0.0.0.0
 | 
						|
    # excludes reserved space >= 224.0.0.0
 | 
						|
    # excludes network & broadcast addresses
 | 
						|
    # (first & last IP address of each class)
 | 
						|
    # MH: Do we really need this? Seems excessive, and seems to have caused
 | 
						|
    # Issue #957
 | 
						|
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
 | 
						|
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
 | 
						|
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
 | 
						|
    r"|"
 | 
						|
    # host & domain names
 | 
						|
    # mods: match is case-sensitive, so include [A-Z]
 | 
						|
      "(?:"  # noqa: E131
 | 
						|
        "(?:"
 | 
						|
          "[A-Za-z0-9\u00a1-\uffff]"
 | 
						|
          "[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
 | 
						|
        ")?"
 | 
						|
        "[A-Za-z0-9\u00a1-\uffff]\."
 | 
						|
      ")+"
 | 
						|
    # TLD identifier
 | 
						|
    # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
 | 
						|
    # strings like "lower.Upper", which can be split on "." by infixes in some
 | 
						|
    # languages
 | 
						|
    r"(?:[" + ALPHA_LOWER + "]{2,63})"
 | 
						|
    r")"
 | 
						|
    # port number
 | 
						|
    r"(?::\d{2,5})?"
 | 
						|
    # resource path
 | 
						|
    r"(?:[/?#]\S*)?"
 | 
						|
    r"$"
 | 
						|
    # fmt: on
 | 
						|
).strip()
 | 
						|
 | 
						|
TOKEN_MATCH = None
 | 
						|
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
 | 
						|
 | 
						|
 | 
						|
BASE_EXCEPTIONS = {}
 | 
						|
 | 
						|
 | 
						|
for exc_data in [
 | 
						|
    {ORTH: " ", POS: SPACE, TAG: "_SP"},
 | 
						|
    {ORTH: "\t", POS: SPACE, TAG: "_SP"},
 | 
						|
    {ORTH: "\\t", POS: SPACE, TAG: "_SP"},
 | 
						|
    {ORTH: "\n", POS: SPACE, TAG: "_SP"},
 | 
						|
    {ORTH: "\\n", POS: SPACE, TAG: "_SP"},
 | 
						|
    {ORTH: "\u2014"},
 | 
						|
    {ORTH: "\u00a0", POS: SPACE, LEMMA: "  ", TAG: "_SP"},
 | 
						|
]:
 | 
						|
    BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
 | 
						|
 | 
						|
 | 
						|
for orth in [
 | 
						|
    "'",
 | 
						|
    '\\")',
 | 
						|
    "<space>",
 | 
						|
    "''",
 | 
						|
    "C++",
 | 
						|
    "a.",
 | 
						|
    "b.",
 | 
						|
    "c.",
 | 
						|
    "d.",
 | 
						|
    "e.",
 | 
						|
    "f.",
 | 
						|
    "g.",
 | 
						|
    "h.",
 | 
						|
    "i.",
 | 
						|
    "j.",
 | 
						|
    "k.",
 | 
						|
    "l.",
 | 
						|
    "m.",
 | 
						|
    "n.",
 | 
						|
    "o.",
 | 
						|
    "p.",
 | 
						|
    "q.",
 | 
						|
    "r.",
 | 
						|
    "s.",
 | 
						|
    "t.",
 | 
						|
    "u.",
 | 
						|
    "v.",
 | 
						|
    "w.",
 | 
						|
    "x.",
 | 
						|
    "y.",
 | 
						|
    "z.",
 | 
						|
    "ä.",
 | 
						|
    "ö.",
 | 
						|
    "ü.",
 | 
						|
]:
 | 
						|
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
 | 
						|
 | 
						|
 | 
						|
emoticons = set(
 | 
						|
    r"""
 | 
						|
:)
 | 
						|
:-)
 | 
						|
:))
 | 
						|
:-))
 | 
						|
:)))
 | 
						|
:-)))
 | 
						|
(:
 | 
						|
(-:
 | 
						|
=)
 | 
						|
(=
 | 
						|
:]
 | 
						|
:-]
 | 
						|
[:
 | 
						|
[-:
 | 
						|
:o)
 | 
						|
(o:
 | 
						|
:}
 | 
						|
:-}
 | 
						|
8)
 | 
						|
8-)
 | 
						|
(-8
 | 
						|
;)
 | 
						|
;-)
 | 
						|
(;
 | 
						|
(-;
 | 
						|
:(
 | 
						|
:-(
 | 
						|
:((
 | 
						|
:-((
 | 
						|
:(((
 | 
						|
:-(((
 | 
						|
):
 | 
						|
)-:
 | 
						|
=(
 | 
						|
>:(
 | 
						|
:')
 | 
						|
:'-)
 | 
						|
:'(
 | 
						|
:'-(
 | 
						|
:/
 | 
						|
:-/
 | 
						|
=/
 | 
						|
=|
 | 
						|
:|
 | 
						|
:-|
 | 
						|
:1
 | 
						|
:P
 | 
						|
:-P
 | 
						|
:p
 | 
						|
:-p
 | 
						|
:O
 | 
						|
:-O
 | 
						|
:o
 | 
						|
:-o
 | 
						|
:0
 | 
						|
:-0
 | 
						|
:()
 | 
						|
>:o
 | 
						|
:*
 | 
						|
:-*
 | 
						|
:3
 | 
						|
:-3
 | 
						|
=3
 | 
						|
:>
 | 
						|
:->
 | 
						|
:X
 | 
						|
:-X
 | 
						|
:x
 | 
						|
:-x
 | 
						|
:D
 | 
						|
:-D
 | 
						|
;D
 | 
						|
;-D
 | 
						|
=D
 | 
						|
xD
 | 
						|
XD
 | 
						|
xDD
 | 
						|
XDD
 | 
						|
8D
 | 
						|
8-D
 | 
						|
 | 
						|
^_^
 | 
						|
^__^
 | 
						|
^___^
 | 
						|
>.<
 | 
						|
>.>
 | 
						|
<.<
 | 
						|
._.
 | 
						|
;_;
 | 
						|
-_-
 | 
						|
-__-
 | 
						|
v.v
 | 
						|
V.V
 | 
						|
v_v
 | 
						|
V_V
 | 
						|
o_o
 | 
						|
o_O
 | 
						|
O_o
 | 
						|
O_O
 | 
						|
0_o
 | 
						|
o_0
 | 
						|
0_0
 | 
						|
o.O
 | 
						|
O.o
 | 
						|
O.O
 | 
						|
o.o
 | 
						|
0.0
 | 
						|
o.0
 | 
						|
0.o
 | 
						|
@_@
 | 
						|
<3
 | 
						|
<33
 | 
						|
<333
 | 
						|
</3
 | 
						|
(^_^)
 | 
						|
(-_-)
 | 
						|
(._.)
 | 
						|
(>_<)
 | 
						|
(*_*)
 | 
						|
(¬_¬)
 | 
						|
ಠ_ಠ
 | 
						|
ಠ︵ಠ
 | 
						|
(ಠ_ಠ)
 | 
						|
¯\(ツ)/¯
 | 
						|
(╯°□°)╯︵┻━┻
 | 
						|
><(((*>
 | 
						|
""".split()
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
for orth in emoticons:
 | 
						|
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
 |