2019-02-01 10:05:22 +03:00
|
|
|
|
import re
|
2017-01-03 20:17:57 +03:00
|
|
|
|
|
2023-06-14 18:48:41 +03:00
|
|
|
|
from ..symbols import NORM, ORTH
|
2020-06-21 23:38:04 +03:00
|
|
|
|
from .char_classes import ALPHA_LOWER
|
2017-05-08 16:55:52 +03:00
|
|
|
|
|
2017-03-05 01:13:11 +03:00
|
|
|
|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
# A few mods to this regex to account for use cases represented in test_urls
#
# The pattern is assembled from adjacent raw-string fragments (implicit
# literal concatenation) with ALPHA_LOWER spliced into the TLD character
# class. It is anchored with ^ and $, so it only matches a whole string.
URL_PATTERN = (
    # fmt: off
    r"^"
    # protocol identifier (mods: make optional and expand schemes)
    # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
    r"(?:(?:[\w\+\-\.]{2,})://)?"
    # mailto:user or user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    # MH: Do we really need this? Seems excessive, and seems to have caused
    # Issue #957
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host & domain names
    # mods: match is case-sensitive, so include [A-Z]
    r"(?:"  # noqa: E131
      r"(?:"  # noqa: E131
        r"[A-Za-z0-9\u00a1-\uffff]"  # noqa: E131
        r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
      r")?"
      r"[A-Za-z0-9\u00a1-\uffff]\."
    r")+"
    # TLD identifier
    # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
    # strings like "lower.Upper", which can be split on "." by infixes in some
    # languages
    r"(?:[" + ALPHA_LOWER + "]{2,63})"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:[/?#]\S*)?"
    r"$"
    # fmt: on
).strip()
|
2017-01-03 20:17:57 +03:00
|
|
|
|
|
2020-05-22 13:41:03 +03:00
|
|
|
|
# Bound ``match`` method of the compiled URL regex; the inline "(?u)" flag is
# prepended to URL_PATTERN before compilation.
URL_MATCH = re.compile(f"(?u){URL_PATTERN}").match
|
2017-01-03 20:17:57 +03:00
|
|
|
|
|
2017-05-08 16:55:52 +03:00
|
|
|
|
|
|
|
|
|
# Exception table: each key is an exact string, each value is a list of
# attribute dicts describing the token(s) that string is mapped to.
BASE_EXCEPTIONS = {}


# Whitespace, escaped-whitespace and dash entries. Each dict is registered
# under its own ORTH value as a single-item list; the non-breaking space
# additionally carries a NORM of a plain space.
for exc_data in [
    {ORTH: " "},
    {ORTH: "\t"},
    {ORTH: "\\t"},
    {ORTH: "\n"},
    {ORTH: "\\n"},
    {ORTH: "\u2014"},
    {ORTH: "\u00a0", NORM: " "},
]:
    BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
|
2017-05-08 16:55:52 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Strings registered verbatim: each one maps to a single-item list whose only
# dict has the string itself as ORTH (quotes, "C++", and single letters
# followed by a period).
for orth in [
    "'",
    '\\")',
    "<space>",
    "''",
    "C++",
    "a.",
    "b.",
    "c.",
    "d.",
    "e.",
    "f.",
    "g.",
    "h.",
    "i.",
    "j.",
    "k.",
    "l.",
    "m.",
    "n.",
    "o.",
    "p.",
    "q.",
    "r.",
    "s.",
    "t.",
    "u.",
    "v.",
    "w.",
    "x.",
    "y.",
    "z.",
    "ä.",
    "ö.",
    "ü.",
]:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
|
|
|
|
|
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
|
# Set of emoticon strings. The raw triple-quoted literal holds one emoticon
# per line and is split on whitespace, so the blank separator line is ignored.
# It must stay a raw string: the shrug emoticon contains a literal backslash.
emoticons = set(
    r"""
:)
:-)
:))
:-))
:)))
:-)))
(:
(-:
=)
(=
:]
:-]
[:
[-:
[=
=]
:o)
(o:
:}
:-}
8)
8-)
(-8
;)
;-)
(;
(-;
:(
:-(
:((
:-((
:(((
:-(((
):
)-:
=(
>:(
:')
:'-)
:'(
:'-(
:/
:-/
=/
=|
:|
:-|
]=
=[
:1
:P
:-P
:p
:-p
:O
:-O
:o
:-o
:0
:-0
:()
>:o
:*
:-*
:3
:-3
=3
:>
:->
:X
:-X
:x
:-x
:D
:-D
;D
;-D
=D
xD
XD
xDD
XDD
8D
8-D

^_^
^__^
^___^
>.<
>.>
<.<
._.
;_;
-_-
-__-
v.v
V.V
v_v
V_V
o_o
o_O
O_o
O_O
0_o
o_0
0_0
o.O
O.o
O.O
o.o
0.0
o.0
0.o
@_@
<3
<33
<333
</3
(^_^)
(-_-)
(._.)
(>_<)
(*_*)
(¬_¬)
ಠ_ಠ
ಠ︵ಠ
(ಠ_ಠ)
¯\(ツ)/¯
(╯°□°)╯︵┻━┻
><(((*>
""".split()
)
|
2017-05-08 16:55:52 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Register every emoticon under its own text, mapped to a single-item list
# whose dict has that text as ORTH.
for orth in emoticons:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
2021-10-27 14:02:25 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Moved from a suffix setting due to #9155 removing prefixes from consideration
# for lookbehinds
# Each "°<letter>." string (letter in c/f/k, either case) maps to three
# entries: the degree sign, the letter, and the trailing period.
for u in "cfkCFK":
    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: u}, {ORTH: "."}]
|