mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			214 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			214 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
# The use of this module turns out to be important, to avoid pathological
 | 
						|
# back-tracking. See Issue #957
 | 
						|
import regex as re
 | 
						|
 | 
						|
from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT
 | 
						|
 | 
						|
 | 
						|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# A few minor mods to this regex to account for use cases represented in test_urls
URL_PATTERN = (
    r"^"
    # match must begin on a word character, so the tokenizer's prefix
    # splitting works (see prefix test cases in test_urls)
    r"(?=[\w])"
    # protocol identifier
    r"(?:(?:https?|ftp|mailto)://)?"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    # MH: Do we really need this? Seems excessive, and seems to have caused
    # Issue #957
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    # query parameters — fixed "(:?" (optional literal colon in a capturing
    # group) to the intended non-capturing group "(?:"
    r"\??(?:\S*)?"
    # match must end on a word character or "/", so the tokenizer's suffix
    # splitting works (see suffix test cases in test_urls)
    r"(?<=[\w/])"
    r"$"
)
# NOTE: the previous ".strip()" on the pattern was a no-op (the constant is
# built from raw literals with no surrounding whitespace) and was removed.

# Bound match-at-start method used by the tokenizer. The third-party `regex`
# module (imported as `re` above) is used to avoid the pathological
# back-tracking reported in Issue #957.
TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match
 | 
						|
 | 
						|
 | 
						|
 | 
						|
# Tokenizer exceptions shared by all languages, keyed by the exception's
# surface form (its ORTH value).
BASE_EXCEPTIONS = {}

# Whitespace-like characters (and the em-dash / non-breaking space) that must
# survive tokenization as single tokens, each with a POS — and where needed a
# lemma — override.
for _exc in (
    {ORTH: " ", POS: SPACE},
    {ORTH: "\t", POS: SPACE},
    {ORTH: "\\t", POS: SPACE},
    {ORTH: "\n", POS: SPACE},
    {ORTH: "\\n", POS: SPACE},
    {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
    {ORTH: "\u00a0", POS: SPACE, LEMMA: "  "},
):
    # copy so later mutation of one entry can't leak into the template
    BASE_EXCEPTIONS[_exc[ORTH]] = [dict(_exc)]
 | 
						|
 | 
						|
 | 
						|
# Single tokens that must never be split further: a few literal artifacts
# plus the one-letter abbreviations "a." through "z." and the umlauts
# "ä." / "ö." / "ü." (generated instead of spelled out).
for orth in ["'", "\\\")", "<space>", "''", "C++"] + [
    letter + "." for letter in "abcdefghijklmnopqrstuvwxyzäöü"
]:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
 | 
						|
 | 
						|
 | 
						|
emoticons = set("""
 | 
						|
:)
 | 
						|
:-)
 | 
						|
:))
 | 
						|
:-))
 | 
						|
:)))
 | 
						|
:-)))
 | 
						|
(:
 | 
						|
(-:
 | 
						|
=)
 | 
						|
(=
 | 
						|
")
 | 
						|
:]
 | 
						|
:-]
 | 
						|
[:
 | 
						|
[-:
 | 
						|
:o)
 | 
						|
(o:
 | 
						|
:}
 | 
						|
:-}
 | 
						|
8)
 | 
						|
8-)
 | 
						|
(-8
 | 
						|
;)
 | 
						|
;-)
 | 
						|
(;
 | 
						|
(-;
 | 
						|
:(
 | 
						|
:-(
 | 
						|
:((
 | 
						|
:-((
 | 
						|
:(((
 | 
						|
:-(((
 | 
						|
):
 | 
						|
)-:
 | 
						|
=(
 | 
						|
>:(
 | 
						|
:')
 | 
						|
:'-)
 | 
						|
:'(
 | 
						|
:'-(
 | 
						|
:/
 | 
						|
:-/
 | 
						|
=/
 | 
						|
=|
 | 
						|
:|
 | 
						|
:-|
 | 
						|
:1
 | 
						|
:P
 | 
						|
:-P
 | 
						|
:p
 | 
						|
:-p
 | 
						|
:O
 | 
						|
:-O
 | 
						|
:o
 | 
						|
:-o
 | 
						|
:0
 | 
						|
:-0
 | 
						|
:()
 | 
						|
>:o
 | 
						|
:*
 | 
						|
:-*
 | 
						|
:3
 | 
						|
:-3
 | 
						|
=3
 | 
						|
:>
 | 
						|
:->
 | 
						|
:X
 | 
						|
:-X
 | 
						|
:x
 | 
						|
:-x
 | 
						|
:D
 | 
						|
:-D
 | 
						|
;D
 | 
						|
;-D
 | 
						|
=D
 | 
						|
xD
 | 
						|
XD
 | 
						|
xDD
 | 
						|
XDD
 | 
						|
8D
 | 
						|
8-D
 | 
						|
 | 
						|
^_^
 | 
						|
^__^
 | 
						|
^___^
 | 
						|
>.<
 | 
						|
>.>
 | 
						|
<.<
 | 
						|
._.
 | 
						|
;_;
 | 
						|
-_-
 | 
						|
-__-
 | 
						|
v.v
 | 
						|
V.V
 | 
						|
v_v
 | 
						|
V_V
 | 
						|
o_o
 | 
						|
o_O
 | 
						|
O_o
 | 
						|
O_O
 | 
						|
0_o
 | 
						|
o_0
 | 
						|
0_0
 | 
						|
o.O
 | 
						|
O.o
 | 
						|
O.O
 | 
						|
o.o
 | 
						|
0.0
 | 
						|
o.0
 | 
						|
0.o
 | 
						|
@_@
 | 
						|
<3
 | 
						|
<33
 | 
						|
<333
 | 
						|
</3
 | 
						|
(^_^)
 | 
						|
(-_-)
 | 
						|
(._.)
 | 
						|
(>_<)
 | 
						|
(*_*)
 | 
						|
(¬_¬)
 | 
						|
ಠ_ಠ
 | 
						|
ಠ︵ಠ
 | 
						|
(ಠ_ಠ)
 | 
						|
¯\(ツ)/¯
 | 
						|
(╯°□°)╯︵┻━┻
 | 
						|
><(((*>
 | 
						|
""".split())
 | 
						|
 | 
						|
 | 
						|
# Register every emoticon verbatim as a single-token exception.
BASE_EXCEPTIONS.update({string: [{ORTH: string}] for string in emoticons})
 |