mirror of https://github.com/explosion/spaCy.git

commit b0d4899473: Merge branch 'master' into spacy.io
@@ -10,10 +10,10 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
+_units = UNITS.replace("%", "")

 _prefixes = (
-    [r"\+"]
-    + LIST_PUNCT
+    LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9])(?:{u})".format(u=_units),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
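The hunk above swaps UNITS for _units in the number-plus-unit suffix rule, so "%" no longer triggers a unit split there. A minimal sketch of the effect, using an illustrative stand-in for spaCy's merged UNITS string (the real value is far longer):

    import re

    UNITS = r"(?:km|kg|%)"           # illustrative stand-in, not spaCy's real value
    _units = UNITS.replace("%", "")  # as in the diff: drop "%" from the unit set

    suffix_re = re.compile(r"(?<=[0-9])(?:{u})$".format(u=_units))
    print(suffix_re.search("100kg"))  # matches: "kg" can be split off as a suffix
    print(suffix_re.search("100%"))   # None: "%" is no longer a unit suffix here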
@@ -10,11 +10,9 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 # A few minor mods to this regex to account for use cases represented in test_urls
 URL_PATTERN = (
     r"^"
-    # in order to support the prefix tokenization (see prefix test cases in test_urls).
-    r"(?=[\w])"
-    # protocol identifier
-    r"(?:(?:https?|ftp|mailto)://)?"
-    # user:pass authentication
+    # protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
+    r"(?:(?:[\w\+\-\.]{2,})://)?"
+    # mailto:user or user:pass authentication
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
@@ -43,11 +41,7 @@ URL_PATTERN = (
     # port number
     r"(?::\d{2,5})?"
     # resource path
-    r"(?:/\S*)?"
-    # query parameters
-    r"\??(:?\S*)?"
-    # in order to support the suffix tokenization (see suffix test cases in test_urls),
-    r"(?<=[\w/])"
+    r"(?:[/?#]\S*)?"
     r"$"
 ).strip()
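The revised protocol fragment accepts any IANA-style scheme of two or more [\w+-.] characters instead of the fixed https?|ftp|mailto set, and the prefix/suffix lookaround hacks are dropped (that work moves into the tokenizer, see the hunks below). A quick check of just the scheme fragment in isolation (not the full URL_PATTERN):

    import re

    # Scheme part as it appears in the new pattern; host, port, path and
    # anchoring are added around it in the real URL_PATTERN.
    scheme_re = re.compile(r"^(?:[\w\+\-\.]{2,})://")
    print(bool(scheme_re.match("svn+ssh://user@host/path")))  # True
    print(bool(scheme_re.match("ftps://foo.bar/")))           # True: now a valid scheme
    print(bool(scheme_re.match("x://host")))                  # False: scheme needs 2+ chars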
@@ -375,7 +375,7 @@ cdef class Lexeme:
             Lexeme.c_set_flag(self.c, IS_STOP, x)

     property is_alpha:
-        """RETURNS (bool): Whether the lexeme consists of alphanumeric
+        """RETURNS (bool): Whether the lexeme consists of alphabetic
             characters. Equivalent to `lexeme.text.isalpha()`.
         """
         def __get__(self):
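The corrected docstring matches the actual behaviour: is_alpha mirrors str.isalpha(), so digits and mixed alphanumeric tokens are excluded. For instance:

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("apple A1 42")
    print([(t.text, t.is_alpha) for t in doc])
    # [('apple', True), ('A1', False), ('42', False)]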
@@ -111,7 +111,7 @@ TOKEN_PATTERN_SCHEMA = {
                 "$ref": "#/definitions/integer_value",
             },
             "IS_ALPHA": {
-                "title": "Token consists of alphanumeric characters",
+                "title": "Token consists of alphabetic characters",
                 "$ref": "#/definitions/boolean_value",
             },
             "IS_ASCII": {
@@ -12,6 +12,7 @@ URLS_BASIC = [

 URLS_FULL = URLS_BASIC + [
     "mailto:foo-bar@baz-co.com",
+    "mailto:foo-bar@baz-co.com?subject=hi",
     "www.google.com?q=google",
     "http://foo.com/blah_(wikipedia)#cite-1",
 ]
@@ -45,6 +46,10 @@ URLS_SHOULD_MATCH = [
     "http://a.b-c.de",
     "http://223.255.255.254",
     "http://a.b--c.de/",  # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
+    "ssh://login@server.com:12345/repository.git",
+    "svn+ssh://user@ssh.yourdomain.com/path",
+    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
+    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
     pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
     pytest.param(
         "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
@@ -81,7 +86,6 @@ URLS_SHOULD_NOT_MATCH = [
     "http:// shouldfail.com",
     ":// should fail",
     "http://foo.bar/foo(bar)baz quux",
-    "ftps://foo.bar/",
     "http://-error-.invalid/",
     "http://a.b-.co",
     "http://0.0.0.0",
@@ -227,7 +227,9 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
-            if self.token_match and self.token_match(string):
+            if self.token_match and self.token_match(string) \
+                    and not self.find_prefix(string) \
+                    and not self.find_suffix(string):
                 break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
@@ -243,8 +245,6 @@ cdef class Tokenizer:
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     has_special[0] = 1
                     break
-                if self.token_match and self.token_match(string):
-                    break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
                 suffix = string[-suf_len:]
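The net effect of these two hunks: token_match (here, the URL pattern) only stops affix stripping when the remaining string has neither a recognisable prefix nor a suffix, and the extra token_match re-check after prefix removal is gone. A plain-Python paraphrase of the new condition, with hypothetical helper signatures mirroring the Cython code:

    def stop_stripping(string, token_match, find_prefix, find_suffix):
        # Accept the whole string as a single token only if it matches
        # token_match AND no prefix or suffix rule would still apply to it.
        return (token_match is not None
                and token_match(string)
                and not find_prefix(string)
                and not find_suffix(string))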
@@ -163,7 +163,7 @@ rule-based matching are:
 | `TEXT` <Tag variant="new">2.1</Tag>    | unicode | The exact verbatim text of a token.                                                                    |
 | `LOWER`                                | unicode | The lowercase form of the token text.                                                                  |
 |  `LENGTH`                              | int     | The length of the token text.                                                                          |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool    | Token text consists of alphanumeric characters, ASCII characters, digits.                              |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool    | Token text consists of alphabetic characters, ASCII characters, digits.                                |
 |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | bool    | Token text is in lowercase, uppercase, titlecase.                                                      |
 |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | bool    | Token is punctuation, whitespace, stop word.                                                           |
 |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | bool    | Token text resembles a number, URL, email.                                                             |
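The attributes in this table plug straight into Matcher patterns. A small sketch combining the table with this commit's URL work, using the v2-style add signature; the printed result is what blank-English tokenization should give here:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    matcher.add("URL", None, [{"LIKE_URL": True}])
    doc = nlp("Docs live at https://spacy.io these days")
    print([doc[start:end].text for _, start, end in matcher(doc)])
    # expected: ['https://spacy.io']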
@@ -573,7 +573,7 @@ apple = doc[0]
 print("Fine-grained POS tag", apple.pos_, apple.pos)
 print("Coarse-grained POS tag", apple.tag_, apple.tag)
 print("Word shape", apple.shape_, apple.shape)
-print("Alphanumeric characters?", apple.is_alpha)
+print("Alphabetic characters?", apple.is_alpha)
 print("Punctuation mark?", apple.is_punct)

 billion = doc[10]