ines 
							
						 
					 
					
						
						
						
						
							
						
						
							aa92d4e9b5 
							
						 
					 
					
						
						
							
							Fix unicode regex for Python 2 (see  #834 )  
						
						
						
					 
					
						2017-02-16 23:49:54 +01:00 
						 
				 
			
				
					
						
							
							
								ines 
							
						 
					 
					
						
						
						
						
							
						
						
							85d249d451 
							
						 
					 
					
						
						
							
							Revert "Revert "Merge pull request  #836  from raphael0202/load_vectors ( closes   #834 )""  
						
						... 
						
						
						
						This reverts commit ea05f78660 
						
					 
					
						2017-02-16 23:26:25 +01:00 
						 
				 
			
				
					
						
							
							
								ines 
							
						 
					 
					
						
						
						
						
							
						
						
							ea05f78660 
							
						 
					 
					
						
						
							
							Revert "Merge pull request  #836  from raphael0202/load_vectors ( closes   #834 )"  
						
						... 
						
						
						
						This reverts commit 7d8c9eee7ff6b69babcc 
						
					 
					
						2017-02-16 15:27:12 +01:00 
						 
				 
			
				
					
						
							
							
								Raphaël Bournhonesque 
							
						 
					 
					
						
						
						
						
							
						
						
							e17dc2db75 
							
						 
					 
					
						
						
							
							Remove useless import  
						
						
						
					 
					
						2017-02-16 12:10:24 +01:00 
						 
				 
			
				
					
						
							
							
								Raphaël Bournhonesque 
							
						 
					 
					
						
						
						
						
							
						
						
							3fd2742649 
							
						 
					 
					
						
						
							
							load_vectors should accept arbitrary space characters as word tokens  
						
						... 
						
						
						
						Fix bug  #834  
						
					 
					
						2017-02-16 12:08:30 +01:00 
						 
				 
			
				
					
						
							
							
								Daniel Hershcovich 
							
						 
					 
					
						
						
						
						
							
						
						
							99eb494a82 
							
						 
					 
					
						
						
							
							Fix   #737 : support loading word vectors with " " as a word  
						
						
						
					 
					
						2017-01-12 17:00:14 +02:00 
						 
				 
			
				
					
						
							
							
								Daniel Hershcovich 
							
						 
					 
					
						
						
						
						
							
						
						
							8e603cc917 
							
						 
					 
					
						
						
							
							Avoid "True if ... else False"  
						
						
						
					 
					
						2017-01-11 11:18:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cade536d1e 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/explosion/spaCy  
						
						
						
					 
					
						2016-12-27 21:04:10 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ce4539dafd 
							
						 
					 
					
						
						
							
							Allow the vocabulary to grow to 10,000, to prevent cold-start problem.  
						
						
						
					 
					
						2016-12-27 21:03:45 +01:00 
						 
				 
			
				
					
						
							
							
								Ines Montani 
							
						 
					 
					
						
						
						
						
							
						
						
							8978806ea6 
							
						 
					 
					
						
						
							
							Allow Vocab to load without serializer_freqs  
						
						
						
					 
					
						2016-12-21 18:05:23 +01:00 
						 
				 
			
				
					
						
							
							
								Ines Montani 
							
						 
					 
					
						
						
						
						
							
						
						
							be8ed811f6 
							
						 
					 
					
						
						
							
							Remove trailing whitespace  
						
						
						
					 
					
						2016-12-21 18:04:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6ee1df93c5 
							
						 
					 
					
						
						
							
							Set tag_map to None if it's not seen in the data by vocab  
						
						
						
					 
					
						2016-12-18 16:51:10 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1e0f566d95 
							
						 
					 
					
						
						
							
							Fix   #656 ,  #624 : Support arbitrary token attributes when adding special-case rules.  
						
						
						
					 
					
						2016-11-25 12:43:24 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f123f92e0c 
							
						 
					 
					
						
						
							
							Fix   #617 : Vocab.load() required Path. Should work with string as well.  
						
						
						
					 
					
						2016-11-10 22:48:48 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b86f8af0c1 
							
						 
					 
					
						
						
							
							Fix doc strings  
						
						
						
					 
					
						2016-11-01 12:25:36 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6036ec7c77 
							
						 
					 
					
						
						
							
							Fix vector norm when loading lexemes.  
						
						
						
					 
					
						2016-10-23 19:40:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3e688e6d4b 
							
						 
					 
					
						
						
							
							Fix issue  #514  -- serializer fails when new entity type has been added. The fix here is quite ugly. It's best to add the entities ASAP after loading the NLP pipeline, to mitigate the brittleness.  
						
						
						
					 
					
						2016-10-23 17:45:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f62088d646 
							
						 
					 
					
						
						
							
							Fix compile error  
						
						
						
					 
					
						2016-10-23 14:50:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a0a4ada42a 
							
						 
					 
					
						
						
							
							Fix calculation of L2-norm for Lexeme  
						
						
						
					 
					
						2016-10-23 14:44:45 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7ab03050d4 
							
						 
					 
					
						
						
							
							Add resize_vectors method to Vocab  
						
						
						
					 
					
						2016-10-21 01:44:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f5fe4f595b 
							
						 
					 
					
						
						
							
							Fix json loading, for Python 3.  
						
						
						
					 
					
						2016-10-20 21:23:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5ec32f5d97 
							
						 
					 
					
						
						
							
							Fix loading of GloVe vectors, to address Issue  #541  
						
						
						
					 
					
						2016-10-20 18:27:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d10c17f2a4 
							
						 
					 
					
						
						
							
							Fix Issue  #536 : oov_prob was 0 for OOV words.  
						
						
						
					 
					
						2016-10-19 23:38:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2bbb050500 
							
						 
					 
					
						
						
							
							Fix default of serializer_freqs  
						
						
						
					 
					
						2016-10-18 19:55:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2cc515b2ed 
							
						 
					 
					
						
						
							
							Add add_flag method to Vocab, re Issue  #504 .  
						
						
						
					 
					
						2016-10-14 12:15:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ea23b64cc8 
							
						 
					 
					
						
						
							
							Refactor training, with new spacy.train module. Defaults still a little awkward.  
						
						
						
					 
					
						2016-10-09 12:24:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ca32a1ab01 
							
						 
					 
					
						
						
							
							Revert "Work on Issue  #285 : intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good."  
						
						... 
						
						
						
						This reverts commit 8423e8627f 
						
					 
					
						2016-09-30 20:20:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1f1cd5013f 
							
						 
					 
					
						
						
							
							Revert "Changes to vocab for new stringstore scheme"  
						
						... 
						
						
						
						This reverts commit a51149a717 
						
					 
					
						2016-09-30 20:10:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a51149a717 
							
						 
					 
					
						
						
							
							Changes to vocab for new stringstore scheme  
						
						
						
					 
					
						2016-09-30 20:01:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8423e8627f 
							
						 
					 
					
						
						
							
							Work on Issue  #285 : intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good.  
						
						
						
					 
					
						2016-09-30 10:14:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							95aaea0d3f 
							
						 
					 
					
						
						
							
							Refactor so that the tokenizer data is read from Python data, rather than from disk  
						
						
						
					 
					
						2016-09-25 14:49:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							df88690177 
							
						 
					 
					
						
						
							
							Fix encoding of path variable  
						
						
						
					 
					
						2016-09-24 21:13:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							af847e07fc 
							
						 
					 
					
						
						
							
							Fix usage of pathlib for Python3 -- turning paths to strings.  
						
						
						
					 
					
						2016-09-24 21:05:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							453683aaf0 
							
						 
					 
					
						
						
							
							Fix spacy/vocab.pyx  
						
						
						
					 
					
						2016-09-24 20:50:31 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fd65cf6cbb 
							
						 
					 
					
						
						
							
							Finish refactoring data loading  
						
						
						
					 
					
						2016-09-24 20:26:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							83e364188c 
							
						 
					 
					
						
						
							
							Mostly finished loading refactoring. Design is in place, but doesn't work yet.  
						
						
						
					 
					
						2016-09-24 15:42:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							872695759d 
							
						 
					 
					
						
						
							
							Merge pull request  #306  from wbwseeker/german_noun_chunks  
						
						... 
						
						
						
						add German noun chunk functionality 
						
					 
					
						2016-04-08 00:54:24 +10:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							b8f63071eb 
							
						 
					 
					
						
						
							
							add lang registration facility  
						
						
						
					 
					
						2016-03-25 18:54:45 +01:00 
						 
				 
			
				
					
						
							
							
								Wolfgang Seeker 
							
						 
					 
					
						
						
						
						
							
						
						
							5e2e8e951a 
							
						 
					 
					
						
						
							
							add baseclass DocIterator for iterators over documents  
						
						... 
						
						
						
						add classes for English and German noun chunks
the respective iterators are set for the document when created by the parser
as they depend on the annotation scheme of the parsing model 
						
					 
					
						2016-03-16 15:53:35 +01:00 
						 
				 
			
				
					
						
							
							
								Wolfgang Seeker 
							
						 
					 
					
						
						
						
						
							
						
						
							03fb498dbe 
							
						 
					 
					
						
						
							
							introduce lang field for LexemeC to hold language id  
						
						... 
						
						
						
						put noun_chunk logic into iterators.py for each language separately 
						
					 
					
						2016-03-10 13:01:34 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							963fe5258e 
							
						 
					 
					
						
						
							
							* Add missing __contains__ method to vocab  
						
						
						
					 
					
						2016-03-08 15:49:10 +00:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							478aa21cb0 
							
						 
					 
					
						
						
							
							* Remove broken __reduce__ method on vocab  
						
						
						
					 
					
						2016-03-08 15:48:21 +00:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							931c07a609 
							
						 
					 
					
						
						
							
							initial proposal for separate vector package  
						
						
						
					 
					
						2016-03-04 11:09:06 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a95974ad3f 
							
						 
					 
					
						
						
							
							* Fix oov probability  
						
						
						
					 
					
						2016-02-06 15:13:55 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dcb401f3e1 
							
						 
					 
					
						
						
							
							* Remove broken Vocab pickling  
						
						
						
					 
					
						2016-02-06 14:08:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							63e3d4e27f 
							
						 
					 
					
						
						
							
							* Add comment on Vocab.__reduce__  
						
						
						
					 
					
						2016-01-19 20:11:25 +01:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							235f094534 
							
						 
					 
					
						
						
							
							untangle data_path/via  
						
						
						
					 
					
						2016-01-16 12:23:45 +01:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							846fa49b2a 
							
						 
					 
					
						
						
							
							distinct load() and from_package() methods  
						
						
						
					 
					
						2016-01-16 10:00:57 +01:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							788f734513 
							
						 
					 
					
						
						
							
							refactored data_dir->via, add zip_safe, add spacy.load()  
						
						
						
					 
					
						2016-01-15 18:01:02 +01:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							bc229790ac 
							
						 
					 
					
						
						
							
							integrate with sputnik  
						
						
						
					 
					
						2016-01-13 19:46:17 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							eaf2ad59f1 
							
						 
					 
					
						
						
							
							* Fix use of mock Package object  
						
						
						
					 
					
						2015-12-31 04:13:15 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aec130af56 
							
						 
					 
					
						
						
							
							Use util.Package class for io  
						
						... 
						
						
						
						Previous Sputnik integration caused API change: Vocab, Tagger, etc
were loaded via a from_package classmethod, that required a
sputnik.Package instance. This forced users to first create a
sputnik.Sputnik() instance, in order to acquire a Package via
sp.pool().
Instead I've created a small file-system shim, util.Package, which
allows classes to have a .load() classmethod, that accepts either
util.Package objects, or strings. We can later gut the internals
of this and make it a proxy for Sputnik if we need more functionality
that should live in the Sputnik library.
Sputnik is now only used to download and install the data, in
spacy.en.download 
						
					 
					
						2015-12-29 18:00:48 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0e2498da00 
							
						 
					 
					
						
						
							
							* Replace from_package with load() classmethod in Vocab  
						
						
						
					 
					
						2015-12-29 16:56:51 +01:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							8359bd4d93 
							
						 
					 
					
						
						
							
							strip data/ from package, friendlier Language invocation, make data_dir backward/forward-compatible  
						
						
						
					 
					
						2015-12-18 09:52:55 +01:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							9027cef3bc 
							
						 
					 
					
						
						
							
							access model via sputnik  
						
						
						
					 
					
						2015-12-07 06:01:28 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6ed3aedf79 
							
						 
					 
					
						
						
							
							* Merge vocab changes  
						
						
						
					 
					
						2015-11-06 00:48:08 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1e99fcd413 
							
						 
					 
					
						
						
							
							* Rename .repvec to .vector in C API  
						
						
						
					 
					
						2015-11-03 23:47:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5887506f5d 
							
						 
					 
					
						
						
							
							* Don't expect lexemes.bin in Vocab  
						
						
						
					 
					
						2015-11-03 13:23:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f11030aadc 
							
						 
					 
					
						
						
							
							* Remove out-dated TODO comment  
						
						
						
					 
					
						2015-10-26 12:33:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a371a1071d 
							
						 
					 
					
						
						
							
							* Save and load word vectors during pickling, re Issue  #125  
						
						
						
					 
					
						2015-10-26 12:33:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							314090cc78 
							
						 
					 
					
						
						
							
							* Set vectors length when unpickling vocab, re Issue  #125  
						
						
						
					 
					
						2015-10-26 12:05:08 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2348a08481 
							
						 
					 
					
						
						
							
							* Load/dump strings with a json file, instead of the hacky strings file we were using.  
						
						
						
					 
					
						2015-10-22 21:13:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7a15d1b60c 
							
						 
					 
					
						
						
							
							* Add Python 2/3 compatibility fix for copy_reg  
						
						
						
					 
					
						2015-10-13 20:04:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							20fd36a0f7 
							
						 
					 
					
						
						
							
							* Very scrappy, likely buggy first-cut pickle implementation, to work on Issue  #125 : allow pickle for Apache Spark. The current implementation sends stuff to temp files, and does almost nothing to ensure all modifiable state is actually preserved. The Language() instance is a deep tree of extension objects, and if pickling during training, some of the C-data state is hard to preserve.  
						
						
						
					 
					
						2015-10-13 13:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f8de403483 
							
						 
					 
					
						
						
							
							* Work on pickling Vocab instances. The current implementation is not correct, but it may serve to see whether this approach is workable. Pickling is necessary to address Issue  #125  
						
						
						
					 
					
						2015-10-13 13:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							85e7944572 
							
						 
					 
					
						
						
							
							* Start trying to pickle Vocab  
						
						
						
					 
					
						2015-10-13 13:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							41012907a8 
							
						 
					 
					
						
						
							
							* Fix variable name  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							37b909b6b6 
							
						 
					 
					
						
						
							
							* Use the symbols file in vocab instead of the symbols subfiles like attrs.pxd  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d70e8cac2c 
							
						 
					 
					
						
						
							
							* Fix empty values in attributes and parts of speech, so symbols align correctly with the StringStore  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a29c8ee23d 
							
						 
					 
					
						
						
							
							* Add symbols to the vocab before reading the strings, so that they line up correctly  
						
						
						
					 
					
						2015-10-13 13:44:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							85ce36ab11 
							
						 
					 
					
						
						
							
							* Refactor symbols, so that frequency rank can be derived from the orth id of a word.  
						
						
						
					 
					
						2015-10-13 13:44:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							83dccf0fd7 
							
						 
					 
					
						
						
							
							* Use io module insteads of deprecated codecs module  
						
						
						
					 
					
						2015-10-10 14:13:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d9f41c2c9 
							
						 
					 
					
						
						
							
							* Add LookupError for better error reporting in Vocab  
						
						
						
					 
					
						2015-10-06 10:34:59 +11:00 
						 
				 
			
				
					
						
							
							
								alvations 
							
						 
					 
					
						
						
						
						
							
						
						
							8caedba42a 
							
						 
					 
					
						
						
							
							caught more codecs.open -> io.open  
						
						
						
					 
					
						2015-09-30 20:20:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							abf0d930af 
							
						 
					 
					
						
						
							
							* Fix API for loading word vectors from a file.  
						
						
						
					 
					
						2015-09-23 23:51:08 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f7283a5067 
							
						 
					 
					
						
						
							
							* Fix vectors bugs for OOV words  
						
						
						
					 
					
						2015-09-22 02:10:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ac459278d1 
							
						 
					 
					
						
						
							
							* Fix vector length error reporting, and ensure vec_len is returned  
						
						
						
					 
					
						2015-09-21 18:08:32 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ba4e563701 
							
						 
					 
					
						
						
							
							* Ensure vectors are same length, and return vector length in load_vectors_bz2  
						
						
						
					 
					
						2015-09-21 18:03:08 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d6945bf880 
							
						 
					 
					
						
						
							
							* Add way to load vectors from bz2 file to vocab  
						
						
						
					 
					
						2015-09-17 12:58:23 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d87519f64 
							
						 
					 
					
						
						
							
							* Remove vectors argument from Vocab object  
						
						
						
					 
					
						2015-09-15 14:47:14 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							27f988b167 
							
						 
					 
					
						
						
							
							* Remove the vectors option to Vocab, preferring to either load vectors from disk, or set them on the Lexeme objects.  
						
						
						
					 
					
						2015-09-15 14:41:48 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e9c59693ea 
							
						 
					 
					
						
						
							
							* Remove assertion from vocab.pyx  
						
						
						
					 
					
						2015-09-13 10:30:08 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e1dfaeed8a 
							
						 
					 
					
						
						
							
							* Check serializer freqs exist before loading  
						
						
						
					 
					
						2015-09-12 23:49:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a412c66c8c 
							
						 
					 
					
						
						
							
							* Check serializer freqs exist before loading  
						
						
						
					 
					
						2015-09-12 23:40:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e285ca7d6c 
							
						 
					 
					
						
						
							
							* Load serializer freqs in vocab  
						
						
						
					 
					
						2015-09-10 15:22:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							094440f9f5 
							
						 
					 
					
						
						
							
							Merge branch 'develop' of ssh://github.com/honnibal/spaCy into develop  
						
						
						
					 
					
						2015-09-10 14:51:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							90da3a695d 
							
						 
					 
					
						
						
							
							* Load lemmatizer from disk in Vocab.from_dir  
						
						
						
					 
					
						2015-09-10 14:49:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f634191e27 
							
						 
					 
					
						
						
							
							* Fix vocab read/write  
						
						
						
					 
					
						2015-09-10 14:44:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a7f4b26c8c 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2015-09-09 14:33:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d6561988cf 
							
						 
					 
					
						
						
							
							* Fix lexemes.bin  
						
						
						
					 
					
						2015-09-09 11:49:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c301bebd33 
							
						 
					 
					
						
						
							
							Merge branch 'master' of  https://github.com/honnibal/spaCy  into develop  
						
						
						
					 
					
						2015-09-09 10:55:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							623329b19a 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/honnibal/spaCy into develop  
						
						
						
					 
					
						2015-09-08 14:27:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							62a01dd41d 
							
						 
					 
					
						
						
							
							* Fix issue  #92 : lexemes.bin read error on 32-bit platforms.  
						
						
						
					 
					
						2015-09-08 14:23:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f6ec5bf1b0 
							
						 
					 
					
						
						
							
							* Use empty tag map in vocab if none supplied  
						
						
						
					 
					
						2015-09-06 20:19:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							534e3dda3c 
							
						 
					 
					
						
						
							
							* More work on language independent parsing  
						
						
						
					 
					
						2015-08-28 03:44:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c2307fa9ee 
							
						 
					 
					
						
						
							
							* More work on language-generic parsing  
						
						
						
					 
					
						2015-08-28 02:02:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1302d35dff 
							
						 
					 
					
						
						
							
							* Rework interfaces in vocab  
						
						
						
					 
					
						2015-08-26 19:21:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6f1743692a 
							
						 
					 
					
						
						
							
							* Work on language-independent refactoring  
						
						
						
					 
					
						2015-08-23 20:49:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cad0cca4e3 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2015-08-22 22:04:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d43f49f69 
							
						 
					 
					
						
						
							
							* Revert prev change  
						
						
						
					 
					
						2015-07-27 10:58:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6b586cdad4 
							
						 
					 
					
						
						
							
							* Change lexemes.bin format. Add a header specifying size of LexemeC and number of lexemes, and don't have the redundant orth information.  
						
						
						
					 
					
						2015-07-27 08:31:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8e4c69ee8c 
							
						 
					 
					
						
						
							
							* Add is_oov property, and fix up handling of attributes  
						
						
						
					 
					
						2015-07-27 01:50:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fc268f03eb 
							
						 
					 
					
						
						
							
							* Assert against null pointer exceptions in vocab  
						
						
						
					 
					
						2015-07-27 01:00:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0f093fdb30 
							
						 
					 
					
						
						
							
							* Fix get_by_orth for py3  
						
						
						
					 
					
						2015-07-26 19:26:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ceeda5a739 
							
						 
					 
					
						
						
							
							* Fix get_by_orth for py3  
						
						
						
					 
					
						2015-07-26 18:39:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6bb96c122d 
							
						 
					 
					
						
						
							
							* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects  
						
						
						
					 
					
						2015-07-26 16:37:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7eb2446082 
							
						 
					 
					
						
						
							
							* Return empty lexeme on empty string  
						
						
						
					 
					
						2015-07-26 00:18:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fd525f0675 
							
						 
					 
					
						
						
							
							* Pass OOV probability around  
						
						
						
					 
					
						2015-07-25 23:29:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							22028602a9 
							
						 
					 
					
						
						
							
							* Add unicode_literals declaration in vocab.pyx  
						
						
						
					 
					
						2015-07-23 13:24:20 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a7c4d72e83 
							
						 
					 
					
						
						
							
							* Add serializer property to Vocab, and lazy-load it. Add get_by_orth method.  
						
						
						
					 
					
						2015-07-23 01:18:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							109106a949 
							
						 
					 
					
						
						
							
							* Replace UniStr, using unicode objects instead  
						
						
						
					 
					
						2015-07-22 04:52:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1f7170e0e1 
							
						 
					 
					
						
						
							
							* Reinstate the fixed vocabulary --- words are only added to the lexicon in init_model, after that we create LexemeC structs with the Pool given to us.  
						
						
						
					 
					
						2015-07-20 01:37:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							317cbbc015 
							
						 
					 
					
						
						
							
							* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time.  
						
						
						
					 
					
						2015-07-19 15:18:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							82d84b0f2b 
							
						 
					 
					
						
						
							
							* Index lexemes by orth, instead of a lexemes vector. Breaks the mechanism for deciding not to own LexemeC structs during parsing. Need to reinstate this.  
						
						
						
					 
					
						2015-07-18 22:42:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c2c83120d4 
							
						 
					 
					
						
						
							
							* Remove codec property from Vocab  
						
						
						
					 
					
						2015-07-17 16:40:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							db9dfd2e23 
							
						 
					 
					
						
						
							
							* Major refactor of serialization. Nearly complete now.  
						
						
						
					 
					
						2015-07-17 01:27:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2a5d050134 
							
						 
					 
					
						
						
							
							* Give codec loading back to Vocab.  
						
						
						
					 
					
						2015-07-16 17:45:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b59d271510 
							
						 
					 
					
						
						
							
							* Move serialization functionality into Serializer class  
						
						
						
					 
					
						2015-07-16 11:23:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							af5cc926a4 
							
						 
					 
					
						
						
							
							* Add codec property to Vocab, to use the Huffman encoding  
						
						
						
					 
					
						2015-07-13 13:55:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							abc43b852d 
							
						 
					 
					
						
						
							
							* Add pos_tags attr to Vocab.  
						
						
						
					 
					
						2015-07-08 12:36:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c04e6ebca6 
							
						 
					 
					
						
						
							
							* Allow user to load different sized vectors.  
						
						
						
					 
					
						2015-06-05 16:26:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							adeb57cb1e 
							
						 
					 
					
						
						
							
							* Fix long line  
						
						
						
					 
					
						2015-06-01 23:07:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							eba7b34f66 
							
						 
					 
					
						
						
							
							* Add flag to disable loading of word vectors  
						
						
						
					 
					
						2015-05-25 01:02:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e73eaf2d05 
							
						 
					 
					
						
						
							
							* Replace some assertions with proper errors  
						
						
						
					 
					
						2015-05-08 16:52:17 +02:00 
						 
				 
			
				
					
						
							
							
								Jordan Suchow 
							
						 
					 
					
						
						
						
						
							
						
						
							3a8d9b37a6 
							
						 
					 
					
						
						
							
							Remove trailing whitespace  
						
						
						
					 
					
						2015-04-19 13:01:38 -07:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f0e0588833 
							
						 
					 
					
						
						
							
							* Fill L2 norm attribute on LexemeC struct  
						
						
						
					 
					
						2015-02-07 08:44:42 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							76d9394cb4 
							
						 
					 
					
						
						
							
							* Fix vocab.pyx for Python3  
						
						
						
					 
					
						2015-02-01 13:14:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ce3ae8b5d9 
							
						 
					 
					
						
						
							
							* Fix platform-specific lexicon bug.  
						
						
						
					 
					
						2015-01-31 16:38:58 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d4a493855e 
							
						 
					 
					
						
						
							
							* Fix error msg  
						
						
						
					 
					
						2015-01-25 23:01:30 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c1c3dba4cb 
							
						 
					 
					
						
						
							
							* Check whether vector files are present before trying to load them.  
						
						
						
					 
					
						2015-01-25 18:16:48 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fda94271af 
							
						 
					 
					
						
						
							
							* Rename NORM1 and NORM2 attrs to lower and norm  
						
						
						
					 
					
						2015-01-24 06:17:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d460c28838 
							
						 
					 
					
						
						
							
							* Rename vec to repvec  
						
						
						
					 
					
						2015-01-22 02:06:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6c7e44140b 
							
						 
					 
					
						
						
							
							* Work on word vectors, and other stuff  
						
						
						
					 
					
						2015-01-17 16:21:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7d3c40de7d 
							
						 
					 
					
						
						
							
							* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme  
						
						
						
					 
					
						2015-01-15 00:33:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0930892fc1 
							
						 
					 
					
						
						
							
							* Tmp. Working on refactor. Compiles, must hook up lexical feats.  
						
						
						
					 
					
						2015-01-14 00:03:48 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							46da3d74d2 
							
						 
					 
					
						
						
							
							* Tmp. Refactoring, introducing a Lexeme PyObject.  
						
						
						
					 
					
						2015-01-12 11:23:44 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ce2edd6312 
							
						 
					 
					
						
						
							
							* Tmp commit. Refactoring to create a Python Lexeme class.  
						
						
						
					 
					
						2015-01-12 10:26:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a58920cc5e 
							
						 
					 
					
						
						
							
							* Import orth.word_shape as a C module  
						
						
						
					 
					
						2015-01-06 03:18:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f5d41028b5 
							
						 
					 
					
						
						
							
							* Move around data files for test release  
						
						
						
					 
					
						2015-01-03 01:59:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bb80937544 
							
						 
					 
					
						
						
							
							* Upd docstrings  
						
						
						
					 
					
						2014-12-27 18:45:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b8b65903fc 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2014-12-24 17:42:00 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							73f200436f 
							
						 
					 
					
						
						
							
							* Tests passing except for morphology/lemmatization stuff  
						
						
						
					 
					
						2014-12-23 11:40:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2a89d70429 
							
						 
					 
					
						
						
							
							* Add vocab.pyx to setup, and ensure we can import spacy.en.lang  
						
						
						
					 
					
						2014-12-21 06:03:53 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e1c1a4b868 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2014-12-21 05:36:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d11c1edf8c 
							
						 
					 
					
						
						
							
							* Import slice_unicode from strings.pyx  
						
						
						
					 
					
						2014-12-20 07:56:26 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							116f7f3bc1 
							
						 
					 
					
						
						
							
							* Rename Lexicon to Vocab, and move it to its own file  
						
						
						
					 
					
						2014-12-20 06:54:03 +11:00