mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Merge branch 'master' into spacy.io
This commit is contained in:
		
						commit
						13b516289b
					
				|  | @ -5,7 +5,7 @@ thinc==7.4.0.dev0 | ||||||
| blis>=0.4.0,<0.5.0 | blis>=0.4.0,<0.5.0 | ||||||
| murmurhash>=0.28.0,<1.1.0 | murmurhash>=0.28.0,<1.1.0 | ||||||
| wasabi>=0.4.0,<1.1.0 | wasabi>=0.4.0,<1.1.0 | ||||||
| srsly>=0.1.0,<1.1.0 | srsly>=1.0.1,<1.1.0 | ||||||
| catalogue>=0.0.7,<1.1.0 | catalogue>=0.0.7,<1.1.0 | ||||||
| # Third party dependencies | # Third party dependencies | ||||||
| numpy>=1.15.0 | numpy>=1.15.0 | ||||||
|  |  | ||||||
|  | @ -47,7 +47,7 @@ install_requires = | ||||||
|     thinc==7.4.0.dev0 |     thinc==7.4.0.dev0 | ||||||
|     blis>=0.4.0,<0.5.0 |     blis>=0.4.0,<0.5.0 | ||||||
|     wasabi>=0.4.0,<1.1.0 |     wasabi>=0.4.0,<1.1.0 | ||||||
|     srsly>=0.1.0,<1.1.0 |     srsly>=1.0.1,<1.1.0 | ||||||
|     catalogue>=0.0.7,<1.1.0 |     catalogue>=0.0.7,<1.1.0 | ||||||
|     # Third-party dependencies |     # Third-party dependencies | ||||||
|     tqdm>=4.38.0,<5.0.0 |     tqdm>=4.38.0,<5.0.0 | ||||||
|  |  | ||||||
|  | @ -1,12 +1,13 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS | from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS | ||||||
| from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||||
| from ..punctuation import TOKENIZER_SUFFIXES | from ..punctuation import TOKENIZER_SUFFIXES | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| _quotes = CONCAT_QUOTES.replace("'", "") | _quotes = CONCAT_QUOTES.replace("'", "") | ||||||
|  | DASHES = "|".join(x for x in LIST_HYPHENS if x != "-") | ||||||
| 
 | 
 | ||||||
| _infixes = ( | _infixes = ( | ||||||
|     LIST_ELLIPSES |     LIST_ELLIPSES | ||||||
|  | @ -14,11 +15,9 @@ _infixes = ( | ||||||
|     + [ |     + [ | ||||||
|         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), |         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), | ||||||
|         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), |         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), | ||||||
|         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), |  | ||||||
|         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), |  | ||||||
|         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), |         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), | ||||||
|         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), |         r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES), | ||||||
|         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), |         r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA), | ||||||
|     ] |     ] | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -31,6 +31,9 @@ for exc_data in [ | ||||||
|     {ORTH: "myöh.", LEMMA: "myöhempi"}, |     {ORTH: "myöh.", LEMMA: "myöhempi"}, | ||||||
|     {ORTH: "n.", LEMMA: "noin"}, |     {ORTH: "n.", LEMMA: "noin"}, | ||||||
|     {ORTH: "nimim.", LEMMA: "nimimerkki"}, |     {ORTH: "nimim.", LEMMA: "nimimerkki"}, | ||||||
|  |     {ORTH: "n:o", LEMMA: "numero"}, | ||||||
|  |     {ORTH: "N:o", LEMMA: "numero"}, | ||||||
|  |     {ORTH: "nro", LEMMA: "numero"}, | ||||||
|     {ORTH: "ns.", LEMMA: "niin sanottu"}, |     {ORTH: "ns.", LEMMA: "niin sanottu"}, | ||||||
|     {ORTH: "nyk.", LEMMA: "nykyinen"}, |     {ORTH: "nyk.", LEMMA: "nykyinen"}, | ||||||
|     {ORTH: "oik.", LEMMA: "oikealla"}, |     {ORTH: "oik.", LEMMA: "oikealla"}, | ||||||
|  |  | ||||||
|  | @ -19,6 +19,21 @@ HYPHENATED_TESTS = [ | ||||||
|     ) |     ) | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | ABBREVIATION_INFLECTION_TESTS = [ | ||||||
|  |     ( | ||||||
|  |         "VTT:ssa ennen v:ta 2010 suoritetut mittaukset", | ||||||
|  |         ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"] | ||||||
|  |     ), | ||||||
|  |     ( | ||||||
|  |         "ALV:n osuus on 24 %.", | ||||||
|  |         ["ALV:n", "osuus", "on", "24", "%", "."] | ||||||
|  |     ), | ||||||
|  |     ( | ||||||
|  |         "Hiihtäjä oli kilpailun 14:s.", | ||||||
|  |         ["Hiihtäjä", "oli", "kilpailun", "14:s", "."] | ||||||
|  |     ) | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS) | @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS) | ||||||
| def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens): | def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens): | ||||||
|  | @ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens): | ||||||
|     tokens = fi_tokenizer(text) |     tokens = fi_tokenizer(text) | ||||||
|     token_list = [token.text for token in tokens if not token.is_space] |     token_list = [token.text for token in tokens if not token.is_space] | ||||||
|     assert expected_tokens == token_list |     assert expected_tokens == token_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS) | ||||||
|  | def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens): | ||||||
|  |     tokens = fi_tokenizer(text) | ||||||
|  |     token_list = [token.text for token in tokens if not token.is_space] | ||||||
|  |     assert expected_tokens == token_list | ||||||
|  |  | ||||||
|  | @ -327,7 +327,7 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy).. | ||||||
| ### Disabling the parser {#disabling} | ### Disabling the parser {#disabling} | ||||||
| 
 | 
 | ||||||
| In the [default models](/models), the parser is loaded and enabled as part of | In the [default models](/models), the parser is loaded and enabled as part of | ||||||
| the [standard processing pipeline](/usage/processing-pipelin). If you don't need | the [standard processing pipeline](/usage/processing-pipelines). If you don't need | ||||||
| any of the syntactic information, you should disable the parser. Disabling the | any of the syntactic information, you should disable the parser. Disabling the | ||||||
| parser will make spaCy load and run much faster. If you want to load the parser, | parser will make spaCy load and run much faster. If you want to load the parser, | ||||||
| but need to disable it for specific documents, you can also control its use on | but need to disable it for specific documents, you can also control its use on | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user