	Merge branch 'master' of github.com:pmbaumgartner/spaCy
Commit: 040bb061fd
pyproject.toml
@@ -5,6 +5,6 @@ requires = ["setuptools",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=2.0.1,<2.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=7.0.6,<7.1.0",
+    "thinc>=7.0.8,<7.1.0",
     ]
 build-backend = "setuptools.build_meta"
requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc>=7.0.6,<7.1.0
+thinc>=7.0.8,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.2.0,<1.1.0

setup.py (2 lines changed)
@@ -228,7 +228,7 @@ def setup_package():
         "murmurhash>=0.28.0,<1.1.0",
         "cymem>=2.0.2,<2.1.0",
         "preshed>=2.0.1,<2.1.0",
-        "thinc>=7.0.6,<7.1.0",
+        "thinc>=7.0.8,<7.1.0",
         "blis>=0.2.2,<0.3.0",
         "plac<1.0.0,>=0.9.6",
         "requests>=2.13.0,<3.0.0",
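Note: the only dependency change across pyproject.toml, requirements.txt, and setup.py is the thinc floor moving from 7.0.6 to 7.0.8. A quick way to confirm an environment satisfies the new pin (a sketch, not part of the commit; pkg_resources ships with setuptools):

import pkg_resources

# Raises VersionConflict if the installed thinc falls outside the pinned range.
pkg_resources.require("thinc>=7.0.8,<7.1.0")
print(pkg_resources.get_distribution("thinc").version)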
spacy/about.py
@@ -4,13 +4,13 @@
 # fmt: off

 __title__ = "spacy"
-__version__ = "2.1.5.dev0"
+__version__ = "2.1.6"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True

 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/attrs.pxd
@@ -1,4 +1,6 @@
 # Reserve 64 values for flag features
+from . cimport symbols
+
 cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA
@@ -82,10 +84,10 @@ cdef enum attr_id_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
     PROB

     LANG
+    ENT_KB_ID = symbols.ENT_KB_ID
spacy/lang/da/punctuation.py
@@ -14,10 +14,11 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
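Note: the new infix rule is the heart of the Danish slash handling. A minimal sketch (not part of the commit; ALPHA below is a simplified stand-in for spaCy's full alphabetic character class) of what it matches: a slash only becomes a split point when a letter or digit precedes it and a letter follows, so "bus/tog/metro" splits while a date like "26/02/2019" stays whole. Listed tokenizer exceptions such as "og/eller" and "km/t" take precedence over the rule and remain single tokens.

import re

ALPHA = "a-zA-ZæøåÆØÅ"  # simplified stand-in for spaCy's ALPHA class
infix = re.compile(r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA))

print(infix.findall("bus/tog/metro"))  # ['/', '/'] -> split into 5 tokens
print(infix.findall("26/02/2019"))     # []         -> no letter after '/', kept whole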
spacy/lang/da/tokenizer_exceptions.py
@@ -52,6 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -64,6 +65,8 @@ for orth in [
     "mik.",
     "pers.",
     "A.D.",
+    "A/B",
+    "a/s",
     "A/S",
     "B.C.",
     "BK.",
@@ -79,7 +82,9 @@ for orth in [
     "Kprs.",
     "L.A.",
     "Ll.",
+    "m/k",
+    "m/s",
+    "m/sek.",
     "M/S",
     "Mag.",
     "Mr.",
@@ -90,6 +95,7 @@ for orth in [
     "Sdr.",
     "Skt.",
     "Spl.",
+    "TCP/IP",
     "Vg.",
 ]:
     _exc[orth] = [{ORTH: orth}]
@@ -141,6 +147,7 @@ for orth in [
     "brolægn.",
     "bto.",
     "bygn.",
+    "c/o",
     "ca.",
     "cand.",
     "d.d.",
@@ -293,6 +300,7 @@ for orth in [
     "kgl.",
     "kl.",
     "kld.",
+    "km/t",
     "knsp.",
     "komm.",
     "kons.",
spacy/symbols.pxd
@@ -81,7 +81,6 @@ cdef enum symbol_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
@@ -461,3 +460,5 @@ cdef enum symbol_t:
     xcomp

     acl
+
+    ENT_KB_ID
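Note: ENT_KB_ID moves from the middle of the enums to the end of symbol_t (with attr_id_t aliasing it) because C enums number their members sequentially: inserting a member mid-enum shifts the integer ID of every symbol after it, which breaks data serialized against the old numbering. A minimal sketch (not from the commit) of the failure mode, using Python's IntEnum as a stand-in for the Cython enums:

from enum import IntEnum

class OldSymbols(IntEnum):       # numbering before the change
    ENT_TYPE = 0
    HEAD = 1
    SENT_START = 2

class InsertedMidEnum(IntEnum):  # hypothetical: insertion shifts IDs
    ENT_TYPE = 0
    ENT_KB_ID = 1
    HEAD = 2                     # was 1 -> old serialized data misreads
    SENT_START = 3

class AppendedAtEnd(IntEnum):    # this commit's approach: append
    ENT_TYPE = 0
    HEAD = 1
    SENT_START = 2
    ENT_KB_ID = 3

assert OldSymbols.HEAD == AppendedAtEnd.HEAD  # existing IDs stay stable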
spacy/tests/lang/da/test_exceptions.py
@@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
 def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
     tokens = da_tokenizer(text)
     assert tokens[0].norm_ == norm
+
+
+@pytest.mark.parametrize(
+    "text,n_tokens",
+    [
+        ("Godt og/eller skidt", 3),
+        ("Kør 4 km/t på vejen", 5),
+        ("Det blæser 12 m/s.", 5),
+        ("Det blæser 12 m/sek. på havnen", 6),
+        ("Windows 8/Windows 10", 5),
+        ("Billeten virker til bus/tog/metro", 8),
+        ("26/02/2019", 1),
+        ("Kristiansen c/o Madsen", 3),
+        ("Sprogteknologi a/s", 2),
+        ("De boede i A/B Bellevue", 5),
+        ("Rotorhastigheden er 3400 o/m.", 5),
+        ("Jeg købte billet t/r.", 5),
+        ("Murerarbejdsmand m/k søges", 3),
+        ("Netværket kører over TCP/IP", 4),
+    ],
+)
+def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == n_tokens

spacy/tests/regression/test_issue3611.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+import spacy
+from spacy.util import minibatch, compounding
+
+
+def test_issue3611():
+    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
+    unique_classes = ["offensive", "inoffensive"]
+    x_train = ["This is an offensive text",
+               "This is the second offensive text",
+               "inoff"]
+    y_train = ["offensive", "offensive", "inoffensive"]
+
+    # preparing the data
+    pos_cats = list()
+    for train_instance in y_train:
+        pos_cats.append({label: label == train_instance for label in unique_classes})
+    train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
+
+    # set up the spacy model with a text categorizer component
+    nlp = spacy.blank('en')
+
+    textcat = nlp.create_pipe(
+        "textcat",
+        config={
+            "exclusive_classes": True,
+            "architecture": "bow",
+            "ngram_size": 2
+        }
+    )
+
+    for label in unique_classes:
+        textcat.add_label(label)
+    nlp.add_pipe(textcat, last=True)
+
+    # training the network
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    with nlp.disable_pipes(*other_pipes):
+        optimizer = nlp.begin_training()
+        for i in range(3):
+            losses = {}
+            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+            for batch in batches:
+                texts, annotations = zip(*batch)
+                nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
website/src/widgets/landing.js
@@ -152,20 +152,21 @@ const Landing = ({ data }) => {
             <LandingBannerGrid>
                 <LandingBanner
                     title="spaCy IRL 2019: Two days of NLP"
-                    label="Join us in Berlin"
-                    to="https://irl.spacy.io/2019"
-                    button="Get tickets"
+                    label="Watch the videos"
+                    to="https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc"
+                    button="Watch the videos"
                     background="#ffc194"
                     backgroundImage={irlBackground}
                     color="#1a1e23"
                     small
                 >
-                    We're pleased to invite the spaCy community and other folks working on Natural
+                    We were pleased to invite the spaCy community and other folks working on Natural
                     Language Processing to Berlin this summer for a small and intimate event{' '}
-                    <strong>July 5-6, 2019</strong>. The event includes a hands-on training day for
-                    teams using spaCy in production, followed by a one-track conference. We've
-                    booked a beautiful venue, hand-picked an awesome lineup of speakers and
-                    scheduled plenty of social time to get to know each other and exchange ideas.
+                    <strong>July 6, 2019</strong>. We booked a beautiful venue, hand-picked an
+                    awesome lineup of speakers and scheduled plenty of social time to get to know
+                    each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
+                    research, development and applications, with keynotes by Sebastian Ruder
+                    (DeepMind) and Yoav Goldberg (Allen AI).
                 </LandingBanner>

                 <LandingBanner