mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 20:16:23 +03:00
5861308910
Handle tokenizer special cases more generally by using the Matcher internally to match special cases after the affix/token_match tokenization is complete. Instead of only matching special cases while processing balanced or nearly balanced prefixes and suffixes, this recognizes special cases in a wider range of contexts: * Allows arbitrary numbers of prefixes/affixes around special cases * Allows special cases separated by infixes Existing tests/settings that couldn't be preserved as before: * The emoticon '")' is no longer a supported special case * The emoticon ':)' in "example:)" is a false positive again When merged with #4258 (or the relevant cache bugfix), the affix and token_match properties should be modified to flush and reload all special cases to use the updated internal tokenization with the Matcher.
245 lines
3.1 KiB
Python
245 lines
3.1 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
import re
|
|
|
|
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
|
|
|
|
|
|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
|
# A few minor mods to this regex to account for use cases represented in test_urls
|
|
URL_PATTERN = (
|
|
r"^"
|
|
# in order to support the prefix tokenization (see prefix test cases in test_urls).
|
|
r"(?=[\w])"
|
|
# protocol identifier
|
|
r"(?:(?:https?|ftp|mailto)://)?"
|
|
# user:pass authentication
|
|
r"(?:\S+(?::\S*)?@)?"
|
|
r"(?:"
|
|
# IP address exclusion
|
|
# private & local networks
|
|
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
|
|
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
|
|
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
|
|
# IP address dotted notation octets
|
|
# excludes loopback network 0.0.0.0
|
|
# excludes reserved space >= 224.0.0.0
|
|
# excludes network & broadcast addresses
|
|
# (first & last IP address of each class)
|
|
# MH: Do we really need this? Seems excessive, and seems to have caused
|
|
# Issue #957
|
|
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
|
|
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
|
|
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
|
|
r"|"
|
|
# host name
|
|
r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"
|
|
# domain name
|
|
r"(?:\.(?:[a-z0-9])(?:[a-z0-9\-])*[a-z0-9])?"
|
|
# TLD identifier
|
|
r"(?:\.(?:[a-z]{2,}))"
|
|
r")"
|
|
# port number
|
|
r"(?::\d{2,5})?"
|
|
# resource path
|
|
r"(?:/\S*)?"
|
|
# query parameters
|
|
r"\??(:?\S*)?"
|
|
# in order to support the suffix tokenization (see suffix test cases in test_urls),
|
|
r"(?<=[\w/])"
|
|
r"$"
|
|
).strip()
|
|
|
|
TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match
|
|
|
|
|
|
BASE_EXCEPTIONS = {}
|
|
|
|
|
|
for exc_data in [
|
|
{ORTH: " ", POS: SPACE, TAG: "_SP"},
|
|
{ORTH: "\t", POS: SPACE, TAG: "_SP"},
|
|
{ORTH: "\\t", POS: SPACE, TAG: "_SP"},
|
|
{ORTH: "\n", POS: SPACE, TAG: "_SP"},
|
|
{ORTH: "\\n", POS: SPACE, TAG: "_SP"},
|
|
{ORTH: "\u2014"},
|
|
{ORTH: "\u00a0", POS: SPACE, LEMMA: " ", TAG: "_SP"},
|
|
]:
|
|
BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
|
|
|
|
|
|
for orth in [
|
|
"'",
|
|
'\\")',
|
|
"<space>",
|
|
"''",
|
|
"C++",
|
|
"a.",
|
|
"b.",
|
|
"c.",
|
|
"d.",
|
|
"e.",
|
|
"f.",
|
|
"g.",
|
|
"h.",
|
|
"i.",
|
|
"j.",
|
|
"k.",
|
|
"l.",
|
|
"m.",
|
|
"n.",
|
|
"o.",
|
|
"p.",
|
|
"q.",
|
|
"r.",
|
|
"s.",
|
|
"t.",
|
|
"u.",
|
|
"v.",
|
|
"w.",
|
|
"x.",
|
|
"y.",
|
|
"z.",
|
|
"ä.",
|
|
"ö.",
|
|
"ü.",
|
|
]:
|
|
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
|
|
|
|
|
emoticons = set(
|
|
r"""
|
|
:)
|
|
:-)
|
|
:))
|
|
:-))
|
|
:)))
|
|
:-)))
|
|
(:
|
|
(-:
|
|
=)
|
|
(=
|
|
:]
|
|
:-]
|
|
[:
|
|
[-:
|
|
:o)
|
|
(o:
|
|
:}
|
|
:-}
|
|
8)
|
|
8-)
|
|
(-8
|
|
;)
|
|
;-)
|
|
(;
|
|
(-;
|
|
:(
|
|
:-(
|
|
:((
|
|
:-((
|
|
:(((
|
|
:-(((
|
|
):
|
|
)-:
|
|
=(
|
|
>:(
|
|
:')
|
|
:'-)
|
|
:'(
|
|
:'-(
|
|
:/
|
|
:-/
|
|
=/
|
|
=|
|
|
:|
|
|
:-|
|
|
:1
|
|
:P
|
|
:-P
|
|
:p
|
|
:-p
|
|
:O
|
|
:-O
|
|
:o
|
|
:-o
|
|
:0
|
|
:-0
|
|
:()
|
|
>:o
|
|
:*
|
|
:-*
|
|
:3
|
|
:-3
|
|
=3
|
|
:>
|
|
:->
|
|
:X
|
|
:-X
|
|
:x
|
|
:-x
|
|
:D
|
|
:-D
|
|
;D
|
|
;-D
|
|
=D
|
|
xD
|
|
XD
|
|
xDD
|
|
XDD
|
|
8D
|
|
8-D
|
|
|
|
^_^
|
|
^__^
|
|
^___^
|
|
>.<
|
|
>.>
|
|
<.<
|
|
._.
|
|
;_;
|
|
-_-
|
|
-__-
|
|
v.v
|
|
V.V
|
|
v_v
|
|
V_V
|
|
o_o
|
|
o_O
|
|
O_o
|
|
O_O
|
|
0_o
|
|
o_0
|
|
0_0
|
|
o.O
|
|
O.o
|
|
O.O
|
|
o.o
|
|
0.0
|
|
o.0
|
|
0.o
|
|
@_@
|
|
<3
|
|
<33
|
|
<333
|
|
</3
|
|
(^_^)
|
|
(-_-)
|
|
(._.)
|
|
(>_<)
|
|
(*_*)
|
|
(¬_¬)
|
|
ಠ_ಠ
|
|
ಠ︵ಠ
|
|
(ಠ_ಠ)
|
|
¯\(ツ)/¯
|
|
(╯°□°)╯︵┻━┻
|
|
><(((*>
|
|
""".split()
|
|
)
|
|
|
|
|
|
for orth in emoticons:
|
|
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|