mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 01:48:04 +03:00

commit 13b516289b
Merge branch 'master' into spacy.io
@@ -5,7 +5,7 @@ thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0
-srsly>=0.1.0,<1.1.0
+srsly>=1.0.1,<1.1.0
 catalogue>=0.0.7,<1.1.0
 # Third party dependencies
 numpy>=1.15.0

@@ -47,7 +47,7 @@ install_requires =
     thinc==7.4.0.dev0
     blis>=0.4.0,<0.5.0
     wasabi>=0.4.0,<1.1.0
-    srsly>=0.1.0,<1.1.0
+    srsly>=1.0.1,<1.1.0
     catalogue>=0.0.7,<1.1.0
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0

@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 
 _quotes = CONCAT_QUOTES.replace("'", "")
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
 
 _infixes = (
     LIST_ELLIPSES

@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
-        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
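For context: the new DASHES pattern and the `(?:{d})` infix rule split tokens on Unicode dash variants while leaving the plain ASCII hyphen attached, and the colon is dropped from the infix set so inflected abbreviations like "VTT:ssa" stay whole. A minimal sketch of the dash behavior with plain `re`, using a hypothetical stand-in for spaCy's LIST_HYPHENS:

import re

# Stand-in subset of LIST_HYPHENS, for illustration only; the real list
# lives in spacy.lang.char_classes.
LIST_HYPHENS = ["-", "–", "—", "~"]
DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")

# Mirrors the added infix rule: split only on non-ASCII dashes between letters.
infix_re = re.compile(r"(?<=[a-zäö])(?:{d})(?=[a-zäö])".format(d=DASHES))

print(bool(infix_re.search("itä–länsi")))  # True: en dash is a split point
print(bool(infix_re.search("itä-länsi")))  # False: plain hyphen stays attached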
				
			
@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},

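For context: each added entry keeps the abbreviation for "numero" as a single token and attaches that lemma at tokenization time. A quick sketch with a blank Finnish pipeline (spaCy v2-era behavior; the sample sentence is invented):

from spacy.lang.fi import Finnish

# Blank Finnish pipeline: tokenizer rules only, no statistical models.
nlp = Finnish()
doc = nlp("Katso asetus nro 356 ja sen N:o 5.")
print([t.text for t in doc])  # "nro" and "N:o" should each survive as one token
print([t.lemma_ for t in doc if t.text in ("nro", "N:o")])  # expected: ["numero", "numero"]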
				
			
@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):

@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list

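For context: the fi_tokenizer fixture these tests rely on is not part of this diff; in spaCy's test suite an equivalent fixture is provided by a shared conftest. A minimal sketch of such a fixture (spaCy v2-era API; layout assumed):

import pytest
from spacy.util import get_lang_class

@pytest.fixture(scope="session")
def fi_tokenizer():
    # Build a bare Finnish tokenizer, mirroring the shared fixture in
    # spaCy's test conftest.
    return get_lang_class("fi").Defaults.create_tokenizer()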
				
			
@@ -327,7 +327,7 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy).
 ### Disabling the parser {#disabling}
 
 In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelin). If you don't need
+the [standard processing pipeline](/usage/processing-pipelines). If you don't need
 any of the syntactic information, you should disable the parser. Disabling the
 parser will make spaCy load and run much faster. If you want to load the parser,
 but need to disable it for specific documents, you can also control its use on

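For context: the corrected passage describes disabling the parser; a minimal sketch of that usage (the model name is only an example):

import spacy

# Load a pipeline without the dependency parser to speed up loading and runtime.
nlp = spacy.load("en_core_web_sm", disable=["parser"])
doc = nlp("This text is processed without syntactic parsing.")
print(doc.is_parsed)  # False: no dependency parse was assigned (spaCy v2)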