import re
from ..symbols import NORM, ORTH
from .char_classes import ALPHA_LOWER
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
# A few mods to this regex to account for use cases represented in test_urls
URL_PATTERN = (
# fmt: off
r"^"
# protocol identifier (mods: make optional and expand schemes)
# (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
r"(?:(?:[\w\+\-\.]{2,})://)?"
# mailto:user or user:pass authentication
r"(?:\S+(?::\S*)?@)?"
r"(?:"
# IP address exclusion
# private & local networks
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
# MH: Do we really need this? Seems excessive, and seems to have caused
# Issue #957
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
r"|"
# host & domain names
# mods: match is case-sensitive, so include [A-Z]
r"(?:" # noqa: E131
r"(?:" # noqa: E131
r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131
r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
r")?"
r"[A-Za-z0-9\u00a1-\uffff]\."
r")+"
# TLD identifier
# mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
# strings like "lower.Upper", which can be split on "." by infixes in some
# languages
r"(?:[" + ALPHA_LOWER + "]{2,63})"
r")"
# port number
r"(?::\d{2,5})?"
# resource path
r"(?:[/?#]\S*)?"
r"$"
# fmt: on
).strip()
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
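
# A quick sanity sketch of the pattern's behavior (illustrative examples,
# not exhaustive): the protocol identifier is optional and private-network
# IPs are excluded, per the mods noted above.
#
#     >>> bool(URL_MATCH("http://example.com"))
#     True
#     >>> bool(URL_MATCH("example.com"))
#     True
#     >>> bool(URL_MATCH("http://10.0.0.1"))
#     False
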
BASE_EXCEPTIONS = {}
for exc_data in [
{ORTH: " "},
{ORTH: "\t"},
{ORTH: "\\t"},
{ORTH: "\n"},
{ORTH: "\\n"},
{ORTH: "\u2014"},
{ORTH: "\u00a0", NORM: " "},
]:
BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
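# Each exception maps the exact matched string to a list of token dicts, so
# each string above survives tokenization as a single token; e.g. the entry
# for the non-breaking space is [{ORTH: "\u00a0", NORM: " "}], which keeps
# the character as one token while normalizing it to a plain space.
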
for orth in [
"'",
'\\")',
"<space>",
"''",
"C++",
"a.",
"b.",
"c.",
"d.",
"e.",
"f.",
"g.",
"h.",
"i.",
"j.",
"k.",
"l.",
"m.",
"n.",
"o.",
"p.",
"q.",
"r.",
"s.",
"t.",
"u.",
"v.",
"w.",
"x.",
"y.",
"z.",
"ä.",
"ö.",
"ü.",
]:
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
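
# e.g. BASE_EXCEPTIONS["z."] == [{ORTH: "z."}], so single-letter
# abbreviations keep their trailing period attached instead of having it
# split off as a suffix.
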
emoticons = set(
r"""
:)
:-)
:))
:-))
:)))
:-)))
(:
(-:
=)
(=
:]
:-]
[:
[-:
[=
=]
:o)
(o:
:}
:-}
8)
8-)
(-8
;)
;-)
(;
(-;
:(
:-(
:((
:-((
:(((
:-(((
):
)-:
=(
>:(
:')
:'-)
:'(
:'-(
:/
:-/
=/
=|
:|
:-|
]=
=[
:1
:P
:-P
:p
:-p
:O
:-O
:o
:-o
:0
:-0
:()
>:o
:*
:-*
:3
:-3
=3
:>
:->
:X
:-X
:x
:-x
:D
:-D
;D
;-D
=D
xD
XD
xDD
XDD
8D
8-D
^_^
^__^
^___^
>.<
>.>
<.<
._.
;_;
-_-
-__-
v.v
V.V
v_v
V_V
o_o
o_O
O_o
O_O
0_o
o_0
0_0
o.O
O.o
O.O
o.o
0.0
o.0
0.o
@_@
<3
<33
<333
</3
(^_^)
(-_-)
(._.)
(>_<)
(*_*)
(¬_¬)
ಠ_ಠ
(ಠ_ಠ)
¯\(ツ)/¯
(╯°□°）╯︵┻━┻
><(((*>
""".split()
)
for orth in emoticons:
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
# Moved from a suffix setting due to #9155 removing prefixes from consideration
# for lookbehinds
for u in "cfkCFK":
BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]