mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-07 15:56:32 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
b0d4899473
|
@ -10,10 +10,10 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
|
||||||
|
|
||||||
_currency = r"\$¢£€¥฿"
|
_currency = r"\$¢£€¥฿"
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
_units = UNITS.replace("%", "")
|
||||||
|
|
||||||
_prefixes = (
|
_prefixes = (
|
||||||
[r"\+"]
|
LIST_PUNCT
|
||||||
+ LIST_PUNCT
|
|
||||||
+ LIST_ELLIPSES
|
+ LIST_ELLIPSES
|
||||||
+ LIST_QUOTES
|
+ LIST_QUOTES
|
||||||
+ [_concat_icons]
|
+ [_concat_icons]
|
||||||
|
@ -29,7 +29,7 @@ _suffixes = (
|
||||||
r"(?<=[0-9])\+",
|
r"(?<=[0-9])\+",
|
||||||
r"(?<=°[FfCcKk])\.",
|
r"(?<=°[FfCcKk])\.",
|
||||||
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=_units),
|
||||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
||||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
||||||
),
|
),
|
||||||
|
|
|
@ -10,11 +10,9 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
|
||||||
# A few minor mods to this regex to account for use cases represented in test_urls
|
# A few minor mods to this regex to account for use cases represented in test_urls
|
||||||
URL_PATTERN = (
|
URL_PATTERN = (
|
||||||
r"^"
|
r"^"
|
||||||
# in order to support the prefix tokenization (see prefix test cases in test_urls).
|
# protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
|
||||||
r"(?=[\w])"
|
r"(?:(?:[\w\+\-\.]{2,})://)?"
|
||||||
# protocol identifier
|
# mailto:user or user:pass authentication
|
||||||
r"(?:(?:https?|ftp|mailto)://)?"
|
|
||||||
# user:pass authentication
|
|
||||||
r"(?:\S+(?::\S*)?@)?"
|
r"(?:\S+(?::\S*)?@)?"
|
||||||
r"(?:"
|
r"(?:"
|
||||||
# IP address exclusion
|
# IP address exclusion
|
||||||
|
@ -43,11 +41,7 @@ URL_PATTERN = (
|
||||||
# port number
|
# port number
|
||||||
r"(?::\d{2,5})?"
|
r"(?::\d{2,5})?"
|
||||||
# resource path
|
# resource path
|
||||||
r"(?:/\S*)?"
|
r"(?:[/?#]\S*)?"
|
||||||
# query parameters
|
|
||||||
r"\??(:?\S*)?"
|
|
||||||
# in order to support the suffix tokenization (see suffix test cases in test_urls),
|
|
||||||
r"(?<=[\w/])"
|
|
||||||
r"$"
|
r"$"
|
||||||
).strip()
|
).strip()
|
||||||
|
|
||||||
|
|
|
@ -375,7 +375,7 @@ cdef class Lexeme:
|
||||||
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
||||||
|
|
||||||
property is_alpha:
|
property is_alpha:
|
||||||
"""RETURNS (bool): Whether the lexeme consists of alphanumeric
|
"""RETURNS (bool): Whether the lexeme consists of alphabetic
|
||||||
characters. Equivalent to `lexeme.text.isalpha()`.
|
characters. Equivalent to `lexeme.text.isalpha()`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -111,7 +111,7 @@ TOKEN_PATTERN_SCHEMA = {
|
||||||
"$ref": "#/definitions/integer_value",
|
"$ref": "#/definitions/integer_value",
|
||||||
},
|
},
|
||||||
"IS_ALPHA": {
|
"IS_ALPHA": {
|
||||||
"title": "Token consists of alphanumeric characters",
|
"title": "Token consists of alphabetic characters",
|
||||||
"$ref": "#/definitions/boolean_value",
|
"$ref": "#/definitions/boolean_value",
|
||||||
},
|
},
|
||||||
"IS_ASCII": {
|
"IS_ASCII": {
|
||||||
|
|
|
@ -12,6 +12,7 @@ URLS_BASIC = [
|
||||||
|
|
||||||
URLS_FULL = URLS_BASIC + [
|
URLS_FULL = URLS_BASIC + [
|
||||||
"mailto:foo-bar@baz-co.com",
|
"mailto:foo-bar@baz-co.com",
|
||||||
|
"mailto:foo-bar@baz-co.com?subject=hi",
|
||||||
"www.google.com?q=google",
|
"www.google.com?q=google",
|
||||||
"http://foo.com/blah_(wikipedia)#cite-1",
|
"http://foo.com/blah_(wikipedia)#cite-1",
|
||||||
]
|
]
|
||||||
|
@ -45,6 +46,10 @@ URLS_SHOULD_MATCH = [
|
||||||
"http://a.b-c.de",
|
"http://a.b-c.de",
|
||||||
"http://223.255.255.254",
|
"http://223.255.255.254",
|
||||||
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
|
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
|
||||||
|
"ssh://login@server.com:12345/repository.git",
|
||||||
|
"svn+ssh://user@ssh.yourdomain.com/path",
|
||||||
|
pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
|
||||||
|
pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
|
||||||
pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
|
pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
|
"http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
|
||||||
|
@ -81,7 +86,6 @@ URLS_SHOULD_NOT_MATCH = [
|
||||||
"http:// shouldfail.com",
|
"http:// shouldfail.com",
|
||||||
":// should fail",
|
":// should fail",
|
||||||
"http://foo.bar/foo(bar)baz quux",
|
"http://foo.bar/foo(bar)baz quux",
|
||||||
"ftps://foo.bar/",
|
|
||||||
"http://-error-.invalid/",
|
"http://-error-.invalid/",
|
||||||
"http://a.b-.co",
|
"http://a.b-.co",
|
||||||
"http://0.0.0.0",
|
"http://0.0.0.0",
|
||||||
|
|
|
@ -227,7 +227,9 @@ cdef class Tokenizer:
|
||||||
cdef unicode minus_suf
|
cdef unicode minus_suf
|
||||||
cdef size_t last_size = 0
|
cdef size_t last_size = 0
|
||||||
while string and len(string) != last_size:
|
while string and len(string) != last_size:
|
||||||
if self.token_match and self.token_match(string):
|
if self.token_match and self.token_match(string) \
|
||||||
|
and not self.find_prefix(string) \
|
||||||
|
and not self.find_suffix(string):
|
||||||
break
|
break
|
||||||
if self._specials.get(hash_string(string)) != NULL:
|
if self._specials.get(hash_string(string)) != NULL:
|
||||||
has_special[0] = 1
|
has_special[0] = 1
|
||||||
|
@ -243,8 +245,6 @@ cdef class Tokenizer:
|
||||||
prefixes.push_back(self.vocab.get(mem, prefix))
|
prefixes.push_back(self.vocab.get(mem, prefix))
|
||||||
has_special[0] = 1
|
has_special[0] = 1
|
||||||
break
|
break
|
||||||
if self.token_match and self.token_match(string):
|
|
||||||
break
|
|
||||||
suf_len = self.find_suffix(string)
|
suf_len = self.find_suffix(string)
|
||||||
if suf_len != 0:
|
if suf_len != 0:
|
||||||
suffix = string[-suf_len:]
|
suffix = string[-suf_len:]
|
||||||
|
|
|
@ -163,7 +163,7 @@ rule-based matching are:
|
||||||
| `TEXT` <Tag variant="new">2.1</Tag> | unicode | The exact verbatim text of a token. |
|
| `TEXT` <Tag variant="new">2.1</Tag> | unicode | The exact verbatim text of a token. |
|
||||||
| `LOWER` | unicode | The lowercase form of the token text. |
|
| `LOWER` | unicode | The lowercase form of the token text. |
|
||||||
| `LENGTH` | int | The length of the token text. |
|
| `LENGTH` | int | The length of the token text. |
|
||||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphanumeric characters, ASCII characters, digits. |
|
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|
||||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
||||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
||||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
||||||
|
|
|
@ -573,7 +573,7 @@ apple = doc[0]
|
||||||
print("Fine-grained POS tag", apple.pos_, apple.pos)
|
print("Fine-grained POS tag", apple.pos_, apple.pos)
|
||||||
print("Coarse-grained POS tag", apple.tag_, apple.tag)
|
print("Coarse-grained POS tag", apple.tag_, apple.tag)
|
||||||
print("Word shape", apple.shape_, apple.shape)
|
print("Word shape", apple.shape_, apple.shape)
|
||||||
print("Alphanumeric characters?", apple.is_alpha)
|
print("Alphabetic characters?", apple.is_alpha)
|
||||||
print("Punctuation mark?", apple.is_punct)
|
print("Punctuation mark?", apple.is_punct)
|
||||||
|
|
||||||
billion = doc[10]
|
billion = doc[10]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user