Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9482d616bc 
							
						 
					 
					
						
						
							
							* Rename spans.pyx to span.pyx  
						
						
						
					 
					
						2015-11-03 23:51:05 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							116da5990a 
							
						 
					 
					
						
						
							
							* Clean up setting of tag in doc.from_bytes  
						
						
						
					 
					
						2015-11-03 23:48:57 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9ec7b9c454 
							
						 
					 
					
						
						
							
							* Clean up unused Constituent struct.  
						
						
						
					 
					
						2015-11-03 23:48:21 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1e99fcd413 
							
						 
					 
					
						
						
							
							* Rename .repvec to .vector in C API  
						
						
						
					 
					
						2015-11-03 23:47:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ee3f9ba581 
							
						 
					 
					
						
						
							
							* Fix test of serializer  
						
						
						
					 
					
						2015-11-03 19:45:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d06ba26371 
							
						 
					 
					
						
						
							
							* Fix test of serializer  
						
						
						
					 
					
						2015-11-03 19:43:27 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4083059650 
							
						 
					 
					
						
						
							
							Merge branch 'master' of  https://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-11-03 09:07:19 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9e37437ba8 
							
						 
					 
					
						
						
							
							* Fix assign_tag in doc.merge  
						
						
						
					 
					
						2015-11-03 19:07:02 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dde9e1357c 
							
						 
					 
					
						
						
							
							* Add todo to morphology.lemmatize  
						
						
						
					 
					
						2015-11-03 18:54:35 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ffedff9e6c 
							
						 
					 
					
						
						
							
							* Remove the archive after download, to save disk space  
						
						
						
					 
					
						2015-11-03 18:54:05 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							85372468e3 
							
						 
					 
					
						
						
							
							* Fix serialize test  
						
						
						
					 
					
						2015-11-03 08:51:33 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							833eb35c57 
							
						 
					 
					
						
						
							
							* Fix tag assignment in doc.from_array  
						
						
						
					 
					
						2015-11-03 18:45:54 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							09664177d7 
							
						 
					 
					
						
						
							
							* Fix tag handling in doc.merge, and assign sent_start when setting heads.  
						
						
						
					 
					
						2015-11-03 18:15:52 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							389a373807 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-11-03 18:07:25 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3f44b3e43f 
							
						 
					 
					
						
						
							
							* Mark serializer test as requiring models  
						
						
						
					 
					
						2015-11-03 18:07:08 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							25ed7be8f8 
							
						 
					 
					
						
						
							
							Merge branch 'master' of  https://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-11-03 07:58:17 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							604ceac4c6 
							
						 
					 
					
						
						
							
							* Fix morphological assignment in doc.merge()  
						
						
						
					 
					
						2015-11-03 17:57:51 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5e040855a5 
							
						 
					 
					
						
						
							
							* Ensure morphological features and lemmas are loaded in from_array, re Issue  #152  
						
						
						
					 
					
						2015-11-03 17:56:50 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5668feb235 
							
						 
					 
					
						
						
							
							* Fix pickle test for python3  
						
						
						
					 
					
						2015-11-03 04:57:02 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6161d2529a 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-11-03 13:36:30 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5887506f5d 
							
						 
					 
					
						
						
							
							* Don't expect lexemes.bin in Vocab  
						
						
						
					 
					
						2015-11-03 13:23:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f7dd377575 
							
						 
					 
					
						
						
							
							* Adjust conjuncts iterator in Token  
						
						
						
					 
					
						2015-11-03 13:23:22 +11:00 
						 
				 
			
				
					
						
							
							
								Andreas Grivas 
							
						 
					 
					
						
						
						
						
							
						
						
							d418f00eb1 
							
						 
					 
					
						
						
							
							fixed error when printing unicode  
						
						
						
					 
					
						2015-11-02 20:23:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							52fc338001 
							
						 
					 
					
						
						
							
							* Set is_parsed and is_tagged attrs when loading annotations into Doc, re Issue  #152  
						
						
						
					 
					
						2015-10-28 10:43:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1c0356e4c2 
							
						 
					 
					
						
						
							
							* Set test file mode to w+t  
						
						
						
					 
					
						2015-10-26 22:40:48 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0fe98f358b 
							
						 
					 
					
						
						
							
							* Fix mode on text file for Python3 in strings test  
						
						
						
					 
					
						2015-10-26 22:25:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8ba9cf905e 
							
						 
					 
					
						
						
							
							* Fix mode on text file for Python3 in strings test  
						
						
						
					 
					
						2015-10-26 21:44:34 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a0730699b1 
							
						 
					 
					
						
						
							
							* Fix mode on text file for Python3 in strings test  
						
						
						
					 
					
						2015-10-26 21:25:56 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							725344d349 
							
						 
					 
					
						
						
							
							* Fix tempfile in test  
						
						
						
					 
					
						2015-10-26 21:08:18 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f11030aadc 
							
						 
					 
					
						
						
							
							* Remove out-dated TODO comment  
						
						
						
					 
					
						2015-10-26 12:33:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a371a1071d 
							
						 
					 
					
						
						
							
							* Save and load word vectors during pickling, re Issue  #125  
						
						
						
					 
					
						2015-10-26 12:33:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a824a98312 
							
						 
					 
					
						
						
							
							* Add tests for pickling vectors, re: Issue  #125  
						
						
						
					 
					
						2015-10-26 12:31:05 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							314090cc78 
							
						 
					 
					
						
						
							
							* Set vectors length when unpickling vocab, re Issue  #125  
						
						
						
					 
					
						2015-10-26 12:05:08 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4e16f9e435 
							
						 
					 
					
						
						
							
							* Move tests underneath spacy/  
						
						
						
					 
					
						2015-10-26 00:07:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3a6e48e814 
							
						 
					 
					
						
						
							
							Merge pull request  #149  from chrisdubois/pickle-patch  
						
						... 
						
						
						
						Add __reduce__ to Tokenizer so that English pickles. 
						
					 
					
						2015-10-25 15:30:31 +11:00 
						 
				 
			
				
					
						
							
							
								Chris DuBois 
							
						 
					 
					
						
						
						
						
							
						
						
							dac8fe7bdb 
							
						 
					 
					
						
						
							
							Add __reduce__ to Tokenizer so that English pickles.  
						
						... 
						
						
						
						- Add tests to test_pickle and test_tokenizer that save to tempfiles. 
						
					 
					
						2015-10-23 22:24:03 -07:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ff4fe524ee 
							
						 
					 
					
						
						
							
							* Fix exception for python 2  
						
						
						
					 
					
						2015-10-23 01:56:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							341a3e85cd 
							
						 
					 
					
						
						
							
							* Upd downloaded data version  
						
						
						
					 
					
						2015-10-23 00:56:57 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f18fd8c659 
							
						 
					 
					
						
						
							
							* Fix language.py for change in StringStore load API  
						
						
						
					 
					
						2015-10-23 03:48:12 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							23855db3ca 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/honnibal/spaCy into develop  
						
						
						
					 
					
						2015-10-23 03:46:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4f13849065 
							
						 
					 
					
						
						
							
							Merge pull request  #145  from henningpeters/master  
						
						... 
						
						
						
						better error reporting, cleanup 
						
					 
					
						2015-10-23 03:45:47 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3be94be0c0 
							
						 
					 
					
						
						
							
							Merge pull request  #148  from maxirmx/master  
						
						... 
						
						
						
						Utf8 encoding for lemma_rules.json 
						
					 
					
						2015-10-22 21:46:28 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c86bda8d1a 
							
						 
					 
					
						
						
							
							* Fix import of uget  
						
						
						
					 
					
						2015-10-22 21:13:56 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2348a08481 
							
						 
					 
					
						
						
							
							* Load/dump strings with a json file, instead of the hacky strings file we were using.  
						
						
						
					 
					
						2015-10-22 21:13:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9baf0abd59 
							
						 
					 
					
						
						
							
							* Save vocab after training.  
						
						
						
					 
					
						2015-10-22 21:09:14 +11:00 
						 
				 
			
				
					
						
							
							
								maxirmx 
							
						 
					 
					
						
						
						
						
							
						
						
							f07e4accd7 
							
						 
					 
					
						
						
							
							Fixing encoding issue  #4  
						
						
						
					 
					
						2015-10-21 20:45:56 +03:00 
						 
				 
			
				
					
						
							
							
								maxirmx 
							
						 
					 
					
						
						
						
						
							
						
						
							fcbfff043f 
							
						 
					 
					
						
						
							
							Fixing encoding issue  #3  
						
						
						
					 
					
						2015-10-21 15:52:34 +03:00 
						 
				 
			
				
					
						
							
							
								maxirmx 
							
						 
					 
					
						
						
						
						
							
						
						
							fe9d2e2c4e 
							
						 
					 
					
						
						
							
							Fixing encode issue  #2  
						
						
						
					 
					
						2015-10-21 15:36:21 +03:00 
						 
				 
			
				
					
						
							
							
								maxirmx 
							
						 
					 
					
						
						
						
						
							
						
						
							e4a1726f77 
							
						 
					 
					
						
						
							
							Fixing encoding issue  
						
						... 
						
						
						
						UTF-8 
						
					 
					
						2015-10-21 14:16:37 +03:00 
						 
				 
			
				
					
						
							
							
								Andreas Grivas 
							
						 
					 
					
						
						
						
						
							
						
						
							93ada458e2 
							
						 
					 
					
						
						
							
							added __repr__ that prints text in ipython for doc, token, and span objects  
						
						
						
					 
					
						2015-10-21 14:11:46 +03:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							ccffd2ef53 
							
						 
					 
					
						
						
							
							fixed extract directory  
						
						
						
					 
					
						2015-10-21 07:59:34 +02:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							da4c9cee06 
							
						 
					 
					
						
						
							
							assert filename match  
						
						
						
					 
					
						2015-10-20 19:33:59 +02:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							4f703f0cb4 
							
						 
					 
					
						
						
							
							better error reporting, cleanup  
						
						
						
					 
					
						2015-10-20 19:11:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9cdea6e450 
							
						 
					 
					
						
						
							
							* Import uget correctly  
						
						
						
					 
					
						2015-10-19 08:32:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6727a46bb5 
							
						 
					 
					
						
						
							
							* Fix Issue  #118 : Matcher behaves unpredictably when matches overlap.  
						
						
						
					 
					
						2015-10-19 16:45:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							135062d23c 
							
						 
					 
					
						
						
							
							* Fix error with merged text when merged region did not have trailing whitespace  
						
						
						
					 
					
						2015-10-19 15:47:04 +11:00 
						 
				 
			
				
					
						
							
							
								Henning Peters 
							
						 
					 
					
						
						
						
						
							
						
						
							bfde91fa49 
							
						 
					 
					
						
						
							
							add custom download tool (uget), replace wget with uget  
						
						
						
					 
					
						2015-10-18 12:35:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9839cd2c0b 
							
						 
					 
					
						
						
							
							* Fix whitespace_ calculation in Token  
						
						
						
					 
					
						2015-10-18 17:21:11 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c99285b8b9 
							
						 
					 
					
						
						
							
							* Clean up C++ usage in spacy/matcher.pyx  
						
						
						
					 
					
						2015-10-18 17:20:50 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a7e6c5ac8f 
							
						 
					 
					
						
						
							
							* Fix Issue  #122 : Incorrect calculation of children after Doc.merge()  
						
						
						
					 
					
						2015-10-18 17:17:27 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3ba66f2dc7 
							
						 
					 
					
						
						
							
							* Add string length cap in Tokenizer.__call__  
						
						
						
					 
					
						2015-10-16 04:54:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6e0f985afc 
							
						 
					 
					
						
						
							
							* Fix token.conjuncts  
						
						
						
					 
					
						2015-10-15 03:49:45 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2e0104ac81 
							
						 
					 
					
						
						
							
							* Fix token.conjuncts  
						
						
						
					 
					
						2015-10-15 03:47:45 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b8f3345a82 
							
						 
					 
					
						
						
							
							* Fix token.conjuncts method  
						
						
						
					 
					
						2015-10-15 03:36:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							23818f89b8 
							
						 
					 
					
						
						
							
							* Fix token.conjuncts method  
						
						
						
					 
					
						2015-10-15 03:34:57 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7a15d1b60c 
							
						 
					 
					
						
						
							
							* Add Python 2/3 compatibility fix for copy_reg  
						
						
						
					 
					
						2015-10-13 20:04:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							329ae57520 
							
						 
					 
					
						
						
							
							* Fix whitespace attachment thing  
						
						
						
					 
					
						2015-10-13 09:46:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							37919eac82 
							
						 
					 
					
						
						
							
							* Fix whitespace attachment in simpler way. Leaves problem with setting left/right children.  
						
						
						
					 
					
						2015-10-13 18:23:24 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c70eb776ae 
							
						 
					 
					
						
						
							
							* Fix whitespace attachment, so that left/right children are consistent with head.  
						
						
						
					 
					
						2015-10-13 15:58:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							531182f937 
							
						 
					 
					
						
						
							
							* Fix Model.__reduce__  
						
						
						
					 
					
						2015-10-13 15:14:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6c227a6c1f 
							
						 
					 
					
						
						
							
							* Fix Model.__reduce__  
						
						
						
					 
					
						2015-10-13 15:10:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							358c82595c 
							
						 
					 
					
						
						
							
							* Fix NAMES list in spacy/parts_of_speech.pyx  
						
						
						
					 
					
						2015-10-13 14:18:45 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c1fdc487bc 
							
						 
					 
					
						
						
							
							Merge branch 'attrs'  
						
						
						
					 
					
						2015-10-13 14:03:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e886e6a406 
							
						 
					 
					
						
						
							
							* Inc version  
						
						
						
					 
					
						2015-10-13 13:46:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							20fd36a0f7 
							
						 
					 
					
						
						
							
							* Very scrappy, likely buggy first-cut pickle implementation, to work on Issue  #125 : allow pickle for Apache Spark. The current implementation sends stuff to temp files, and does almost nothing to ensure all modifiable state is actually preserved. The Language() instance is a deep tree of extension objects, and if pickling during training, some of the C-data state is hard to preserve.  
						
						
						
					 
					
						2015-10-13 13:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f8de403483 
							
						 
					 
					
						
						
							
							* Work on pickling Vocab instances. The current implementation is not correct, but it may serve to see whether this approach is workable. Pickling is necessary to address Issue  #125  
						
						
						
					 
					
						2015-10-13 13:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							85e7944572 
							
						 
					 
					
						
						
							
							* Start trying to pickle Vocab  
						
						
						
					 
					
						2015-10-13 13:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5ca57bd859 
							
						 
					 
					
						
						
							
							* Ensure Morphology can be pickled, to address Issue  #125 .  
						
						
						
					 
					
						2015-10-13 13:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0cee928467 
							
						 
					 
					
						
						
							
							* Allow StringStore to be pickled, to start addressing Issue  #125  
						
						
						
					 
					
						2015-10-13 13:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							41012907a8 
							
						 
					 
					
						
						
							
							* Fix variable name  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e70368d157 
							
						 
					 
					
						
						
							
							* Use lower case strings for dependency label names in symbols enum  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7b4af3d1e7 
							
						 
					 
					
						
						
							
							* Fix parts_of_speech now that symbols list has been reformed  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							37b909b6b6 
							
						 
					 
					
						
						
							
							* Use the symbols file in vocab instead of the symbols subfiles like attrs.pxd  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ce65ec698c 
							
						 
					 
					
						
						
							
							* Remove qualified naming in symbols  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9f4be0adcd 
							
						 
					 
					
						
						
							
							* Map NO_TAG to NIL in parts_of_speech.pxd  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							278e12f7e8 
							
						 
					 
					
						
						
							
							* Addmorphology symbols to morphology. May need to remove these as an enum.  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d80067eda1 
							
						 
					 
					
						
						
							
							* Map empty string to NULL_ATTR in attrs  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d70e8cac2c 
							
						 
					 
					
						
						
							
							* Fix empty values in attributes and parts of speech, so symbols align correctly with the StringStore  
						
						
						
					 
					
						2015-10-13 13:44:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a29c8ee23d 
							
						 
					 
					
						
						
							
							* Add symbols to the vocab before reading the strings, so that they line up correctly  
						
						
						
					 
					
						2015-10-13 13:44:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							74c0853471 
							
						 
					 
					
						
						
							
							* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS  
						
						
						
					 
					
						2015-10-13 13:44:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							10a4a843ea 
							
						 
					 
					
						
						
							
							* Enumerate all symbols in one file  
						
						
						
					 
					
						2015-10-13 13:44:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							85ce36ab11 
							
						 
					 
					
						
						
							
							* Refactor symbols, so that frequency rank can be derived from the orth id of a word.  
						
						
						
					 
					
						2015-10-13 13:44:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dfbcff2ff1 
							
						 
					 
					
						
						
							
							* Revert codecs/io change to strings.pyx, as it seemed to cause an error? Will investigate.  
						
						
						
					 
					
						2015-10-10 15:54:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9dd2f25c74 
							
						 
					 
					
						
						
							
							* Fix Issue #131: Force whitespace characters to attach syntactically to previous token, and ensure they cannot serve as stand-alone 'sentence' units.  
						
						
						
					 
					
						2015-10-10 15:53:30 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8b39feefbe 
							
						 
					 
					
						
						
							
							* Add dependency post-process rule to ensure spaces are attached to neighbouring tokens, so that they can't be sentence boundaries  
						
						
						
					 
					
						2015-10-10 15:32:13 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2153067958 
							
						 
					 
					
						
						
							
							* Fix use of io in strings.pyx  
						
						
						
					 
					
						2015-10-10 15:03:12 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ec874247b5 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-10-10 14:23:51 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							30de4135c9 
							
						 
					 
					
						
						
							
							* Fix merge problem  
						
						
						
					 
					
						2015-10-10 14:22:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dc393a5f1d 
							
						 
					 
					
						
						
							
							Merge pull request  #126  from tomtung/master  
						
						... 
						
						
						
						Improve slicing support for both Doc and Span 
						
					 
					
						2015-10-10 14:14:57 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							83dccf0fd7 
							
						 
					 
					
						
						
							
							* Use io module instead of deprecated codecs module  
						
						
						
					 
					
						2015-10-10 14:13:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a3dfe2b901 
							
						 
					 
					
						
						
							
							* Increment data version  
						
						
						
					 
					
						2015-10-09 13:26:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2d9e5bf566 
							
						 
					 
					
						
						
							
							* Allow punctuation to be lemmatized  
						
						
						
					 
					
						2015-10-09 19:02:42 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5332c0b697 
							
						 
					 
					
						
						
							
							* Add support for punctuation lemmatization, to handle unicode characters. This should help in addressing Issue #130  
						
						
						
					 
					
						2015-10-09 18:54:40 +11:00 
						 
				 
			
				
					
						
							
							
								Yubing (Tom) Dong 
							
						 
					 
					
						
						
						
						
							
						
						
							9a6811acc4 
							
						 
					 
					
						
						
							
							Merge remote-tracking branch 'upstream/master'  
						
						
						
					 
					
						2015-10-08 22:53:02 -07:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b125289f30 
							
						 
					 
					
						
						
							
							* Fix type declaration in asciied function  
						
						
						
					 
					
						2015-10-09 13:46:57 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							801d55a6d9 
							
						 
					 
					
						
						
							
							* Fix phrase matcher  
						
						
						
					 
					
						2015-10-09 02:00:45 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b3a70e6375 
							
						 
					 
					
						
						
							
							* Clean up unnecessary try/except block  
						
						
						
					 
					
						2015-10-08 14:34:11 +11:00 
						 
				 
			
				
					
						
							
							
								Yubing (Tom) Dong 
							
						 
					 
					
						
						
						
						
							
						
						
							0f601b8b75 
							
						 
					 
					
						
						
							
							Update docstring of Doc.__getitem__  
						
						
						
					 
					
						2015-10-07 01:27:28 -07:00 
						 
				 
			
				
					
						
							
							
								Yubing (Tom) Dong 
							
						 
					 
					
						
						
						
						
							
						
						
							3fd3bc79aa 
							
						 
					 
					
						
						
							
							Refactor to remove duplicate slicing logic  
						
						
						
					 
					
						2015-10-07 01:25:35 -07:00 
						 
				 
			
				
					
						
							
							
								Yubing (Tom) Dong 
							
						 
					 
					
						
						
						
						
							
						
						
							97685aecb7 
							
						 
					 
					
						
						
							
							Add slicing support to Span  
						
						
						
					 
					
						2015-10-06 02:45:49 -07:00 
						 
				 
			
				
					
						
							
							
								Yubing (Tom) Dong 
							
						 
					 
					
						
						
						
						
							
						
						
							ef2af20cd3 
							
						 
					 
					
						
						
							
							Make Doc's slicing behavior conform to Python conventions  
						
						
						
					 
					
						2015-10-06 02:41:28 -07:00 
						 
				 
			
				
					
						
							
							
								Yubing (Tom) Dong 
							
						 
					 
					
						
						
						
						
							
						
						
							2fc33e8024 
							
						 
					 
					
						
						
							
							Allow step=1 when slicing a Doc  
						
						
						
					 
					
						2015-10-06 00:57:05 -07:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b228a8f4a6 
							
						 
					 
					
						
						
							
							* Remove spacy/en/attrs  
						
						
						
					 
					
						2015-10-06 16:20:46 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							693677fd8d 
							
						 
					 
					
						
						
							
							* Prepare to remove en/attrx file, now that moving to symbols.pyx  
						
						
						
					 
					
						2015-10-06 16:20:13 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d9f41c2c9 
							
						 
					 
					
						
						
							
							* Add LookupError for better error reporting in Vocab  
						
						
						
					 
					
						2015-10-06 10:34:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ecc5281b36 
							
						 
					 
					
						
						
							
							* Remove en/pos.pyx, as the tagger code now lives in spacy/tagger.pyx  
						
						
						
					 
					
						2015-10-06 10:12:08 +11:00 
						 
				 
			
				
					
						
							
							
								alvations 
							
						 
					 
					
						
						
						
						
							
						
						
							8caedba42a 
							
						 
					 
					
						
						
							
							caught more codecs.open -> io.open  
						
						
						
					 
					
						2015-09-30 20:20:09 +02:00 
						 
				 
			
				
					
						
							
							
								alvations 
							
						 
					 
					
						
						
						
						
							
						
						
							8199012d26 
							
						 
					 
					
						
						
							
							changing deprecated codecs.open to io.open =)  
						
						
						
					 
					
						2015-09-30 20:10:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							87e6186828 
							
						 
					 
					
						
						
							
							* Rename _seq to doc attribute in Span  
						
						
						
					 
					
						2015-09-29 23:03:55 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ab694b0364 
							
						 
					 
					
						
						
							
							* Fix open-bounded slice indices.  
						
						
						
					 
					
						2015-09-29 23:03:09 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a6ced80c0c 
							
						 
					 
					
						
						
							
							* Fix Issue #116: Misleading handling of True value in Language.__init__.  
						
						
						
					 
					
						2015-09-29 20:54:12 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f9d2a5b651 
							
						 
					 
					
						
						
							
							* Fix issue #112: Replace unidecode with text-unidecode, to avoid license problems.  
						
						
						
					 
					
						2015-09-28 23:40:18 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2c33a96ac3 
							
						 
					 
					
						
						
							
							Merge pull request  #99  from rw/patch-1  
						
						... 
						
						
						
						Force SSL for downloading English language data. 
						
					 
					
						2015-09-28 17:46:26 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							abf0d930af 
							
						 
					 
					
						
						
							
							* Fix API for loading word vectors from a file.  
						
						
						
					 
					
						2015-09-23 23:51:08 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f5c256745b 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-09-22 12:26:24 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							528e26a506 
							
						 
					 
					
						
						
							
							* Add rule to ensure ordinals are preserved as single tokens  
						
						
						
					 
					
						2015-09-22 12:26:05 +10:00 
						 
				 
			
				
					
						
							
							
								Robert 
							
						 
					 
					
						
						
						
						
							
						
						
							8711b64860 
							
						 
					 
					
						
						
							
							Force SSL for downloading English language data.  
						
						... 
						
						
						
						It would also be nice to have a checksum for this. 
						
					 
					
						2015-09-21 17:26:01 -07:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f7283a5067 
							
						 
					 
					
						
						
							
							* Fix vectors bugs for OOV words  
						
						
						
					 
					
						2015-09-22 02:10:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							44aecba701 
							
						 
					 
					
						
						
							
							* Fix Token.has_vector and Lexeme.has_vector  
						
						
						
					 
					
						2015-09-22 01:43:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							596fde8daa 
							
						 
					 
					
						
						
							
							* Add has_vector attribute to Token and Lexeme  
						
						
						
					 
					
						2015-09-21 19:52:43 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f32927efbf 
							
						 
					 
					
						
						
							
							* Raise exceptions if attempt to access parse, but data is not installed. This partly but not fully addresses Issue #97. Still need exceptions on the various Token attributes that access the parse tree, e.g. token.head, token.lefts, token.rights, etc. Exceptions should be centralized, too.  
						
						
						
					 
					
						2015-09-21 18:35:40 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							388062ae01 
							
						 
					 
					
						
						
							
							* Fix repvec_length problem  
						
						
						
					 
					
						2015-09-21 18:10:51 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ac459278d1 
							
						 
					 
					
						
						
							
							* Fix vector length error reporting, and ensure vec_len is returned  
						
						
						
					 
					
						2015-09-21 18:08:32 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ba4e563701 
							
						 
					 
					
						
						
							
							* Ensure vectors are same length, and return vector length in load_vectors_bz2  
						
						
						
					 
					
						2015-09-21 18:03:08 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d00fe2bbc6 
							
						 
					 
					
						
						
							
							* Don't allow Span objects to be written to, as it introduces subtle bugs because they're created afresh from Doc.sents, Doc.ents etc.  
						
						
						
					 
					
						2015-09-21 17:59:39 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d6945bf880 
							
						 
					 
					
						
						
							
							* Add way to load vectors from bz2 file to vocab  
						
						
						
					 
					
						2015-09-17 12:58:23 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							77856c4fcd 
							
						 
					 
					
						
						
							
							* Try giving Doc and Span objects vector and vector_norm attributes, and .similarity functions. Turns out to be bad idea.  
						
						
						
					 
					
						2015-09-17 11:50:11 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							191d593e03 
							
						 
					 
					
						
						
							
							* Fix vectors bug in lexeme  
						
						
						
					 
					
						2015-09-15 19:05:11 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d87519f64 
							
						 
					 
					
						
						
							
							* Remove vectors argument from Vocab object  
						
						
						
					 
					
						2015-09-15 14:47:14 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							362526b592 
							
						 
					 
					
						
						
							
							* Rename vectors_length attribute  
						
						
						
					 
					
						2015-09-15 14:43:31 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							60c26b2dfa 
							
						 
					 
					
						
						
							
							* Fix slicing when start or stop is None  
						
						
						
					 
					
						2015-09-15 14:43:10 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7ac6cacc26 
							
						 
					 
					
						
						
							
							* Remove const qualifier on LexemeC.repvec  
						
						
						
					 
					
						2015-09-15 14:42:51 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dd4d64b235 
							
						 
					 
					
						
						
							
							* Support setting of word vectors on Lexeme object.  
						
						
						
					 
					
						2015-09-15 14:42:27 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							27f988b167 
							
						 
					 
					
						
						
							
							* Remove the vectors option to Vocab, preferring to either load vectors from disk, or set them on the Lexeme objects.  
						
						
						
					 
					
						2015-09-15 14:41:48 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							193f127f81 
							
						 
					 
					
						
						
							
							* Fix ugly py_check_flag and py_set_flag functions in Lexeme  
						
						
						
					 
					
						2015-09-15 13:06:18 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9561d88529 
							
						 
					 
					
						
						
							
							* Add is_stop to Python API  
						
						
						
					 
					
						2015-09-14 18:25:40 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							65dc0d1dfb 
							
						 
					 
					
						
						
							
							* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.  
						
						
						
					 
					
						2015-09-14 17:49:58 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e13e47e9e5 
							
						 
					 
					
						
						
							
							* Add English stop words  
						
						
						
					 
					
						2015-09-14 17:48:51 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							24ed3fc25c 
							
						 
					 
					
						
						
							
							* Check file existence before opening in lemmatizer  
						
						
						
					 
					
						2015-09-13 10:45:21 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dbb48ce49e 
							
						 
					 
					
						
						
							
							* Delete extra wordnets  
						
						
						
					 
					
						2015-09-13 10:31:37 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e9c59693ea 
							
						 
					 
					
						
						
							
							* Remove assertion from vocab.pyx  
						
						
						
					 
					
						2015-09-13 10:30:08 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c08f10083c 
							
						 
					 
					
						
						
							
							* Add test and test_with_ws attributes.  
						
						
						
					 
					
						2015-09-13 10:27:42 +10:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0b7d2a6c62 
							
						 
					 
					
						
						
							
							* Inc version  
						
						
						
					 
					
						2015-09-13 01:26:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e1dfaeed8a 
							
						 
					 
					
						
						
							
							* Check serializer freqs exist before loading  
						
						
						
					 
					
						2015-09-12 23:49:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a412c66c8c 
							
						 
					 
					
						
						
							
							* Check serializer freqs exist before loading  
						
						
						
					 
					
						2015-09-12 23:40:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							631c843ed1 
							
						 
					 
					
						
						
							
							* Don't look for index.adv in lemmatizer  
						
						
						
					 
					
						2015-09-12 06:03:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dfdd4f2d60 
							
						 
					 
					
						
						
							
							Merge branch 'develop' of  https://github.com/honnibal/spaCy  into develop  
						
						
						
					 
					
						2015-09-10 15:23:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e285ca7d6c 
							
						 
					 
					
						
						
							
							* Load serializer freqs in vocab  
						
						
						
					 
					
						2015-09-10 15:22:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f7fdcce1f9 
							
						 
					 
					
						
						
							
							Merge branch 'develop' of  https://github.com/honnibal/spaCy  into develop  
						
						
						
					 
					
						2015-09-10 14:52:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							85c3fec1d1 
							
						 
					 
					
						
						
							
							* Fix morphology loading  
						
						
						
					 
					
						2015-09-10 14:52:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7c660c5efc 
							
						 
					 
					
						
						
							
							* Use dict.get in lemmatizer  
						
						
						
					 
					
						2015-09-10 14:51:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							094440f9f5 
							
						 
					 
					
						
						
							
							Merge branch 'develop' of ssh://github.com/honnibal/spaCy into develop  
						
						
						
					 
					
						2015-09-10 14:51:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c3f773cd63 
							
						 
					 
					
						
						
							
							* Fix Lexeme.check_flag  
						
						
						
					 
					
						2015-09-10 14:51:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							90da3a695d 
							
						 
					 
					
						
						
							
							* Load lemmatizer from disk in Vocab.from_dir  
						
						
						
					 
					
						2015-09-10 14:49:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e7e529edf4 
							
						 
					 
					
						
						
							
							* Fix Lexeme.check_flag  
						
						
						
					 
					
						2015-09-10 14:45:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9e7bfe8449 
							
						 
					 
					
						
						
							
							* Fix space at end of merged token  
						
						
						
					 
					
						2015-09-10 14:45:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f634191e27 
							
						 
					 
					
						
						
							
							* Fix vocab read/write  
						
						
						
					 
					
						2015-09-10 14:44:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							31ccf494e6 
							
						 
					 
					
						
						
							
							Merge branch 'develop' of  https://github.com/honnibal/spaCy  into develop  
						
						
						
					 
					
						2015-09-09 14:33:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a7f4b26c8c 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2015-09-09 14:33:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							07686470a9 
							
						 
					 
					
						
						
							
							* Don't consider a coordinated NP a base chunk  
						
						
						
					 
					
						2015-09-09 14:32:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d9f1fc2112 
							
						 
					 
					
						
						
							
							* Add deprecation warning for unused load_vectors argument.  
						
						
						
					 
					
						2015-09-09 14:31:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0b527fbdc8 
							
						 
					 
					
						
						
							
							* Set POS tag in morphology  
						
						
						
					 
					
						2015-09-09 14:30:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							07c09a0e1b 
							
						 
					 
					
						
						
							
							* Fix attribute getters and setters in Lexeme  
						
						
						
					 
					
						2015-09-09 14:29:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d6561988cf 
							
						 
					 
					
						
						
							
							* Fix lexemes.bin  
						
						
						
					 
					
						2015-09-09 11:49:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c301bebd33 
							
						 
					 
					
						
						
							
							Merge branch 'master' of  https://github.com/honnibal/spaCy  into develop  
						
						
						
					 
					
						2015-09-09 10:55:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0e24d099a1 
							
						 
					 
					
						
						
							
							* Fix L/R edge bug, by ensuring l_edge and r_edge are preset, and fixing the way the edge update in del_arc. Bugs keep arising here because the edges are absolute positions, where everything else is relative. I'm also not 100% convinced that del_arc is handled correctly. Do we need to update the parents?  
						
						
						
					 
					
						2015-09-09 03:40:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2be3620333 
							
						 
					 
					
						
						
							
							* Save morphological analyses in a cache  
						
						
						
					 
					
						2015-09-08 15:39:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1def5a6cbe 
							
						 
					 
					
						
						
							
							* Fix print statements in matcher  
						
						
						
					 
					
						2015-09-08 15:38:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							64d71f8893 
							
						 
					 
					
						
						
							
							* Fix lemmatizer  
						
						
						
					 
					
						2015-09-08 15:38:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							623329b19a 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/honnibal/spaCy into develop  
						
						
						
					 
					
						2015-09-08 14:27:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							62a01dd41d 
							
						 
					 
					
						
						
							
							* Fix issue #92: lexemes.bin read error on 32-bit platforms.  
						
						
						
					 
					
						2015-09-08 14:23:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ef58607a99 
							
						 
					 
					
						
						
							
							* Add spacy.it  
						
						
						
					 
					
						2015-09-06 22:10:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2154a54f6b 
							
						 
					 
					
						
						
							
							* Add spacy.de  
						
						
						
					 
					
						2015-09-06 21:56:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f6ec5bf1b0 
							
						 
					 
					
						
						
							
							* Use empty tag map in vocab if none supplied  
						
						
						
					 
					
						2015-09-06 20:19:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4f8e38271d 
							
						 
					 
					
						
						
							
							* Fix merge errors in lexeme.pxd  
						
						
						
					 
					
						2015-09-06 20:19:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							86c888667f 
							
						 
					 
					
						
						
							
							* Merge in changes from de branch  
						
						
						
					 
					
						2015-09-06 19:49:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d2fc104a26 
							
						 
					 
					
						
						
							
							* Begin merge of Gazetteer and DE branches  
						
						
						
					 
					
						2015-09-06 19:45:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dbf8dce109 
							
						 
					 
					
						
						
							
							Merge branch 'gaz' of ssh://github.com/honnibal/spaCy into gaz  
						
						
						
					 
					
						2015-09-06 18:44:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9eae9837c4 
							
						 
					 
					
						
						
							
							* Fix morphology look up  
						
						
						
					 
					
						2015-09-06 17:53:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6427a3fcac 
							
						 
					 
					
						
						
							
							* Temporarily import flag attributes in matcher  
						
						
						
					 
					
						2015-09-06 17:53:12 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7cc56ada6e 
							
						 
					 
					
						
						
							
							* Temporarily add py_set_flag attribute in Lexeme  
						
						
						
					 
					
						2015-09-06 17:52:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e35bb36be7 
							
						 
					 
					
						
						
							
							* Ensure Lexeme.check_flag returns a boolean value  
						
						
						
					 
					
						2015-09-06 17:52:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7e4fea67d3 
							
						 
					 
					
						
						
							
							* Fix bug in token subtree, introduced by duplication of L/R code in Stateclass. Need to consolidate the two methods.  
						
						
						
					 
					
						2015-09-06 10:48:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5edac11225 
							
						 
					 
					
						
						
							
							* Wrap self.parse in nogil, and break if an invalid move is predicted. The invalid break is a work-around that papers over likely bugs, but we can't easily break in the nogil block, and otherwise we'll get an infinite loop. Need to set this as an error flag.  
						
						
						
					 
					
						2015-09-06 04:15:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fd1eeb3102 
							
						 
					 
					
						
						
							
							* Add POS attribute support in get_attr  
						
						
						
					 
					
						2015-09-06 04:13:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							534e3dda3c 
							
						 
					 
					
						
						
							
							* More work on language independent parsing  
						
						
						
					 
					
						2015-08-28 03:44:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c2307fa9ee 
							
						 
					 
					
						
						
							
							* More work on language-generic parsing  
						
						
						
					 
					
						2015-08-28 02:02:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							86c4a8e3e2 
							
						 
					 
					
						
						
							
							* Work on new morphology organization  
						
						
						
					 
					
						2015-08-27 23:11:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5b89e2454c 
							
						 
					 
					
						
						
							
							* Improve error-reporting in tagger  
						
						
						
					 
					
						2015-08-27 10:26:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f0a7c99554 
							
						 
					 
					
						
						
							
							* Relax rule-requirement in lemmatizer  
						
						
						
					 
					
						2015-08-27 10:26:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0af139e183 
							
						 
					 
					
						
						
							
							* Tagger training now working. Still need to test load/save of model. Morphology still broken.  
						
						
						
					 
					
						2015-08-27 09:16:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1302d35dff 
							
						 
					 
					
						
						
							
							* Rework interfaces in vocab  
						
						
						
					 
					
						2015-08-26 19:21:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2d521768a3 
							
						 
					 
					
						
						
							
							* Store Morphology class in Vocab  
						
						
						
					 
					
						2015-08-26 19:21:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d30029979e 
							
						 
					 
					
						
						
							
							* Avoid import of morphology in spans  
						
						
						
					 
					
						2015-08-26 19:20:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							119c0f8c3f 
							
						 
					 
					
						
						
							
							* Hack out morphology stuff from tokenizer, while morphology is being reimplemented.  
						
						
						
					 
					
						2015-08-26 19:20:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b4faf551f5 
							
						 
					 
					
						
						
							
							* Refactor language-independent tagger class  
						
						
						
					 
					
						2015-08-26 19:19:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a3d5e6c0dd 
							
						 
					 
					
						
						
							
							* Reform constructor and save/load workflow in parser model  
						
						
						
					 
					
						2015-08-26 19:19:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1d7f2d3abc 
							
						 
					 
					
						
						
							
							* Hack on morphology structs  
						
						
						
					 
					
						2015-08-26 19:18:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f8f2f4e545 
							
						 
					 
					
						
						
							
							* Temporarily add PUNC name to parts_of_speech dictionary, until better solution  
						
						
						
					 
					
						2015-08-26 19:18:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							008b02b035 
							
						 
					 
					
						
						
							
							* Comment out enums in Morphology for now  
						
						
						
					 
					
						2015-08-26 19:17:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							378729f81a 
							
						 
					 
					
						
						
							
							* Hack Morphology class towards usability  
						
						
						
					 
					
						2015-08-26 19:17:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							430affc347 
							
						 
					 
					
						
						
							
							* Fix missing n_patterns property in Matcher class. Fix from_dir method  
						
						
						
					 
					
						2015-08-26 19:17:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3acf60df06 
							
						 
					 
					
						
						
							
							* Add missing properties in Lexeme class  
						
						
						
					 
					
						2015-08-26 19:16:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							76996f4145 
							
						 
					 
					
						
						
							
							* Hack on generic Language class. Still needs work for morphology, defaults, etc  
						
						
						
					 
					
						2015-08-26 19:16:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e2ef78b29c 
							
						 
					 
					
						
						
							
							* Gut pos.pyx module, since functionality moved to spacy/tagger.pyx  
						
						
						
					 
					
						2015-08-26 19:15:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c4d8754385 
							
						 
					 
					
						
						
							
							* Specify LOCAL_DATA_DIR global in spacy.en.__init__.py  
						
						
						
					 
					
						2015-08-26 19:15:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c2d8edd0bd 
							
						 
					 
					
						
						
							
							* Add PROB attribute in attrs.pxd  
						
						
						
					 
					
						2015-08-26 19:14:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c5a27d1821 
							
						 
					 
					
						
						
							
							* Move lemmatizer to spacy  
						
						
						
					 
					
						2015-08-25 15:47:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							82217c6ec6 
							
						 
					 
					
						
						
							
							* Generalize lemmatizer  
						
						
						
					 
					
						2015-08-25 15:46:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8083a07c3e 
							
						 
					 
					
						
						
							
							* Use language base class  
						
						
						
					 
					
						2015-08-25 15:37:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f2f699ac18 
							
						 
					 
					
						
						
							
							* Add language base class  
						
						
						
					 
					
						2015-08-25 15:37:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5dd76be446 
							
						 
					 
					
						
						
							
							* Split EnPosTagger up into base class and subclass  
						
						
						
					 
					
						2015-08-24 05:25:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5d5922dbfa 
							
						 
					 
					
						
						
							
							* Begin laying out morphological features  
						
						
						
					 
					
						2015-08-24 01:04:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6f1743692a 
							
						 
					 
					
						
						
							
							* Work on language-independent refactoring  
						
						
						
					 
					
						2015-08-23 20:49:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3879d28457 
							
						 
					 
					
						
						
							
							* Fix https for url detection  
						
						
						
					 
					
						2015-08-23 02:40:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cad0cca4e3 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2015-08-22 22:04:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bf38b3b883 
							
						 
					 
					
						
						
							
							* Hack on l/r reversal bug  
						
						
						
					 
					
						2015-08-10 05:58:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6116413b47 
							
						 
					 
					
						
						
							
							* Fix label prediction in StepwiseState  
						
						
						
					 
					
						2015-08-10 05:05:31 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2c9753eff2 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-08-10 00:09:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9de98f5a6f 
							
						 
					 
					
						
						
							
							* Add Parser.stepthrough method, with context manager  
						
						
						
					 
					
						2015-08-10 00:08:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fe43f8cf39 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-08-09 02:31:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9c090945e0 
							
						 
					 
					
						
						
							
							* Add Parser.predict method, and clean up Parser.get_state  
						
						
						
					 
					
						2015-08-09 02:29:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							04fccfb984 
							
						 
					 
					
						
						
							
							* Fix get_state for parser prediction  
						
						
						
					 
					
						2015-08-09 02:11:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							55fde0e240 
							
						 
					 
					
						
						
							
							* Fix get_state  
						
						
						
					 
					
						2015-08-09 01:45:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f0f4fa9838 
							
						 
					 
					
						
						
							
							* Fix Parser.get_state  
						
						
						
					 
					
						2015-08-09 01:40:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							18331dca89 
							
						 
					 
					
						
						
							
							* Add continue_for argument to parser 'partial' function, which is now renamed to get_state  
						
						
						
					 
					
						2015-08-09 01:31:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0653288fa5 
							
						 
					 
					
						
						
							
							* Fix stateclass.queue  
						
						
						
					 
					
						2015-08-09 00:39:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9de218b7ba 
							
						 
					 
					
						
						
							
							* Fix Parser.partial function  
						
						
						
					 
					
						2015-08-08 23:45:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							01be34d55a 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-08-08 23:37:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cc9deae960 
							
						 
					 
					
						
						
							
							* Add is_valid method to transition_system  
						
						
						
					 
					
						2015-08-08 23:36:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2a46c77324 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-08-08 23:35:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7bafc789e7 
							
						 
					 
					
						
						
							
							* Add stack and queue properties to stateclass, for python access  
						
						
						
					 
					
						2015-08-08 23:32:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3af938365f 
							
						 
					 
					
						
						
							
							* Add function partial to Parser  
						
						
						
					 
					
						2015-08-08 23:32:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							76a1f0481a 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-08-08 23:31:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b0f5c39084 
							
						 
					 
					
						
						
							
							* Fix handling of exclusion entities  
						
						
						
					 
					
						2015-08-06 17:28:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9f65879991 
							
						 
					 
					
						
						
							
							* Fix shape attr bug, and fix handling of false positive matches  
						
						
						
					 
					
						2015-08-06 17:28:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							10d869d102 
							
						 
					 
					
						
						
							
							* Don't allow conjunction between NPs in base NP chunks  
						
						
						
					 
					
						2015-08-06 16:31:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							383dfabd67 
							
						 
					 
					
						
						
							
							* Fix matcher setting of entities  
						
						
						
					 
					
						2015-08-06 16:27:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							59c3bf60a6 
							
						 
					 
					
						
						
							
							* Ensure entity recognizer doesn't over-write preset types  
						
						
						
					 
					
						2015-08-06 16:09:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cd7d1682cd 
							
						 
					 
					
						
						
							
							* Fix loading of gazetteer.json file  
						
						
						
					 
					
						2015-08-06 16:08:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9c667b7f15 
							
						 
					 
					
						
						
							
							* Set a value in attrs.pxd on the first flag, to reduce bugs  
						
						
						
					 
					
						2015-08-06 16:08:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c263577424 
							
						 
					 
					
						
						
							
							* Fix lower attribute in lexeme.pxd  
						
						
						
					 
					
						2015-08-06 16:07:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5737115e1e 
							
						 
					 
					
						
						
							
							* Work on gazetteer matching  
						
						
						
					 
					
						2015-08-06 14:33:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9c1724ecae 
							
						 
					 
					
						
						
							
							* Gazetteer stuff working, now need to wire up to API  
						
						
						
					 
					
						2015-08-06 00:35:40 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5bc0e83f9a 
							
						 
					 
					
						
						
							
							* Reimplement matching in Cython, instead of Python.  
						
						
						
					 
					
						2015-08-05 01:05:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4c87a696b3 
							
						 
					 
					
						
						
							
							* Add draft dfa matcher, in Python. Passing tests.  
						
						
						
					 
					
						2015-08-04 15:55:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							eb7138c761 
							
						 
					 
					
						
						
							
							* Add attr relation in base NP detection  
						
						
						
					 
					
						2015-08-01 00:34:40 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4988356cf0 
							
						 
					 
					
						
						
							
							* Fix dependency type bug from merged tokens  
						
						
						
					 
					
						2015-08-01 00:33:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							78a9068319 
							
						 
					 
					
						
						
							
							* Fix spacy attr on merged tokens  
						
						
						
					 
					
						2015-07-30 04:25:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							430e2edb96 
							
						 
					 
					
						
						
							
							* Fix noun_chunks issue  
						
						
						
					 
					
						2015-07-30 03:51:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9590968fc1 
							
						 
					 
					
						
						
							
							* Fix negative indices in Span  
						
						
						
					 
					
						2015-07-30 02:30:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							74d8cb3980 
							
						 
					 
					
						
						
							
							* Add noun_chunks iterator, and fix left/right child setting in Doc.merge  
						
						
						
					 
					
						2015-07-30 02:29:49 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d153f18969 
							
						 
					 
					
						
						
							
							* Fix negative indices on spans  
						
						
						
					 
					
						2015-07-29 22:36:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b5132bed7d 
							
						 
					 
					
						
						
							
							* Set left and right children when loading parse from byte string  
						
						
						
					 
					
						2015-07-28 21:03:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6609fcf4b2 
							
						 
					 
					
						
						
							
							* Make mem and vocab python-visible in Doc  
						
						
						
					 
					
						2015-07-28 20:46:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d42fe2e694 
							
						 
					 
					
						
						
							
							* Add unicode_literals to strings.pyx  
						
						
						
					 
					
						2015-07-28 16:15:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bb910cff92 
							
						 
					 
					
						
						
							
							* Fix Python3 problem in align_raw  
						
						
						
					 
					
						2015-07-28 16:06:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dcafb181b9 
							
						 
					 
					
						
						
							
							* Fix Python3 problem in align_raw  
						
						
						
					 
					
						2015-07-28 15:52:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c609ea18f0 
							
						 
					 
					
						
						
							
							* Increment version in download script  
						
						
						
					 
					
						2015-07-28 15:22:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9c4d0aae62 
							
						 
					 
					
						
						
							
							* Switch to better Python2/3 compatible unicode handling  
						
						
						
					 
					
						2015-07-28 14:45:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7606d9936f 
							
						 
					 
					
						
						
							
							* Python3 correction for GoldParse  
						
						
						
					 
					
						2015-07-28 14:44:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ddc1a5cfe5 
							
						 
					 
					
						
						
							
							* Fix training under python3  
						
						
						
					 
					
						2015-07-28 14:09:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a8bbd7312c 
							
						 
					 
					
						
						
							
							* Hackishly patch long dependencies problem  
						
						
						
					 
					
						2015-07-28 00:14:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bb583f7f09 
							
						 
					 
					
						
						
							
							* Hackishly patch long dependencies problem  
						
						
						
					 
					
						2015-07-27 23:14:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aa7a964a4f 
							
						 
					 
					
						
						
							
							* Add a type declaration for doc.from_array  
						
						
						
					 
					
						2015-07-27 22:57:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							25a8774f42 
							
						 
					 
					
						
						
							
							* Fix regression in packer  
						
						
						
					 
					
						2015-07-27 21:53:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1601e488ee 
							
						 
					 
					
						
						
							
							* Fix bug in decoding non-ascii characters  
						
						
						
					 
					
						2015-07-27 21:43:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6a95409cd2 
							
						 
					 
					
						
						
							
							* Fix type on bits  
						
						
						
					 
					
						2015-07-27 21:16:49 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a296d72b54 
							
						 
					 
					
						
						
							
							* Fix en/attrs  
						
						
						
					 
					
						2015-07-27 21:16:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							45460f505c 
							
						 
					 
					
						
						
							
							* Fix data type on read32 in BitArray  
						
						
						
					 
					
						2015-07-27 21:12:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d43f49f69 
							
						 
					 
					
						
						
							
							* Revert prev change  
						
						
						
					 
					
						2015-07-27 10:58:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6b586cdad4 
							
						 
					 
					
						
						
							
							* Change lexemes.bin format. Add a header specifying size of LexemeC and number of lexemes, and don't have the redundant orth information.  
						
						
						
					 
					
						2015-07-27 08:31:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							af6ed18f2a 
							
						 
					 
					
						
						
							
							* Ensure we don't use orth_encode on OOV words.  
						
						
						
					 
					
						2015-07-27 02:12:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8535d872e8 
							
						 
					 
					
						
						
							
							* Set is_oov property in get_flags  
						
						
						
					 
					
						2015-07-27 01:51:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8e4c69ee8c 
							
						 
					 
					
						
						
							
							* Add is_oov property, and fix up handling of attributes  
						
						
						
					 
					
						2015-07-27 01:50:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fc268f03eb 
							
						 
					 
					
						
						
							
							* Assert against null pointer exceptions in vocab  
						
						
						
					 
					
						2015-07-27 01:00:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0f093fdb30 
							
						 
					 
					
						
						
							
							* Fix get_by_orth for py3  
						
						
						
					 
					
						2015-07-26 19:26:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ceeda5a739 
							
						 
					 
					
						
						
							
							* Fix get_by_orth for py3  
						
						
						
					 
					
						2015-07-26 18:39:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6bb96c122d 
							
						 
					 
					
						
						
							
							* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects  
						
						
						
					 
					
						2015-07-26 16:37:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							eeaea25f0c 
							
						 
					 
					
						
						
							
							* Check oov_prob file is present  
						
						
						
					 
					
						2015-07-26 16:36:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7eb2446082 
							
						 
					 
					
						
						
							
							* Return empty lexeme on empty string  
						
						
						
					 
					
						2015-07-26 00:18:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1b5d1da2a7 
							
						 
					 
					
						
						
							
							* Allow an OOV probability to be specified in get_lex_props  
						
						
						
					 
					
						2015-07-26 00:03:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cd6e25132b 
							
						 
					 
					
						
						
							
							* Allow an OOV probability to be specified in get_lex_props  
						
						
						
					 
					
						2015-07-26 00:01:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fd525f0675 
							
						 
					 
					
						
						
							
							* Pass OOV probability around  
						
						
						
					 
					
						2015-07-25 23:29:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3fe14b8ed6 
							
						 
					 
					
						
						
							
							* Fix CFile for Python2  
						
						
						
					 
					
						2015-07-25 22:55:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							823ef4a00b 
							
						 
					 
					
						
						
							
							* Remove profile declarations  
						
						
						
					 
					
						2015-07-25 18:13:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f4809e562f 
							
						 
					 
					
						
						
							
							* Allow json to be used as a fallback if ujson is not available  
						
						
						
					 
					
						2015-07-25 18:11:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9da06671cf 
							
						 
					 
					
						
						
							
							* Remove unused import  
						
						
						
					 
					
						2015-07-25 18:11:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2060935cdb 
							
						 
					 
					
						
						
							
							* Remove explicit bytes type in doc.from_bytes, to accept bytearray  
						
						
						
					 
					
						2015-07-24 04:54:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aa28e2e01d 
							
						 
					 
					
						
						
							
							* Release the GIL around parse function  
						
						
						
					 
					
						2015-07-24 04:53:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d62eb34b76 
							
						 
					 
					
						
						
							
							* More Py 2/3 compatibility in bit strings  
						
						
						
					 
					
						2015-07-24 04:52:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0bb839d299 
							
						 
					 
					
						
						
							
							* Fix string coercion for Python 3  
						
						
						
					 
					
						2015-07-24 03:49:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c4ff410fdb 
							
						 
					 
					
						
						
							
							* Fix bytes problems for Python3  
						
						
						
					 
					
						2015-07-24 03:48:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1ab25e4dad 
							
						 
					 
					
						
						
							
							* Fix python3 type error  
						
						
						
					 
					
						2015-07-24 02:45:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f35ff173b0 
							
						 
					 
					
						
						
							
							* Fix bits.pyx unicode error  
						
						
						
					 
					
						2015-07-23 20:37:57 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1406e24327 
							
						 
					 
					
						
						
							
							* Fix unicode error for Python3  
						
						
						
					 
					
						2015-07-23 19:36:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dbda6c27fa 
							
						 
					 
					
						
						
							
							* Fix python3 error  
						
						
						
					 
					
						2015-07-23 14:52:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							99387f9572 
							
						 
					 
					
						
						
							
							* Fix python3 error  
						
						
						
					 
					
						2015-07-23 14:30:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b81ffe9032 
							
						 
					 
					
						
						
							
							* Fix typing on mode string in CFile  
						
						
						
					 
					
						2015-07-23 13:24:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							22028602a9 
							
						 
					 
					
						
						
							
							* Add unicode_literals declaration in vocab.pyx  
						
						
						
					 
					
						2015-07-23 13:24:20 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5b41744270 
							
						 
					 
					
						
						
							
							* Check for directory presence before loading annotators  
						
						
						
					 
					
						2015-07-23 09:27:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							df01a88763 
							
						 
					 
					
						
						
							
							Merge branch 'refactor' (and serializaton)  
						
						... 
						
						
						
						Add Huffman-code serialization, and do a lot of
refactoring. Highlights include:
* Much more efficient StringStore
* Vocab maintains a by-orth mapping of Lexemes
* Avoid manually slicing Py_UNICODE buffers,
  simplifying tokenizer and vocab C APIs
* Remove various bits of dead code
* Work on removing GIL around parser
* Work on bridge to Theano
Conflicts:
	spacy/strings.pxd
	spacy/strings.pyx
	spacy/structs.pxd 
						
					 
					
						2015-07-23 02:18:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a7c4d72e83 
							
						 
					 
					
						
						
							
							* Add serializer property to Vocab, and lazy-load it. Add get_by_orth method.  
						
						
						
					 
					
						2015-07-23 01:18:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6ab1696b15 
							
						 
					 
					
						
						
							
							* Remove read_encoding_freqs from util.py  
						
						
						
					 
					
						2015-07-23 01:17:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d5255aad77 
							
						 
					 
					
						
						
							
							* Update freqs for missing tags in ner, for serializer  
						
						
						
					 
					
						2015-07-23 01:17:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							12699a1152 
							
						 
					 
					
						
						
							
							* Set initial freqs, to avoid missing values in serializer  
						
						
						
					 
					
						2015-07-23 01:16:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							680bb47b55 
							
						 
					 
					
						
						
							
							* Write serializer freqs to single file, vocab/serializer.json  
						
						
						
					 
					
						2015-07-23 01:15:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a0e36e8efc 
							
						 
					 
					
						
						
							
							* Add working to/from bytes API to Doc  
						
						
						
					 
					
						2015-07-23 01:14:45 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1f31d96bf9 
							
						 
					 
					
						
						
							
							* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway.  
						
						
						
					 
					
						2015-07-23 01:13:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							38ef986b29 
							
						 
					 
					
						
						
							
							* Update spacy/en/attrs.pxd  
						
						
						
					 
					
						2015-07-23 01:10:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							06eac32610 
							
						 
					 
					
						
						
							
							* Add cfile.pyx  
						
						
						
					 
					
						2015-07-23 01:10:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0c507bd80a 
							
						 
					 
					
						
						
							
							* Fix tokenizer  
						
						
						
					 
					
						2015-07-22 14:10:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c86dbe4944 
							
						 
					 
					
						
						
							
							* Update English.save_models for new Packer save/load stuff  
						
						
						
					 
					
						2015-07-22 13:40:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bf77bcd6b9 
							
						 
					 
					
						
						
							
							* Add comment explaining hash_string  
						
						
						
					 
					
						2015-07-22 13:39:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							815bda201d 
							
						 
					 
					
						
						
							
							* Remove UniStr struct  
						
						
						
					 
					
						2015-07-22 13:39:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2fc66e3723 
							
						 
					 
					
						
						
							
							* Use Py_UNICODE in tokenizer for now, while sort out Py_UCS4 stuff  
						
						
						
					 
					
						2015-07-22 13:38:45 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4d61239eac 
							
						 
					 
					
						
						
							
							* Reorganize the serialization functions on Doc  
						
						
						
					 
					
						2015-07-22 04:53:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							109106a949 
							
						 
					 
					
						
						
							
							* Replace UniStr, using unicode objects instead  
						
						
						
					 
					
						2015-07-22 04:52:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							424854028f 
							
						 
					 
					
						
						
							
							* Fix decode_int32  
						
						
						
					 
					
						2015-07-21 20:09:59 +00:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							304d0e2633 
							
						 
					 
					
						
						
							
							* Use decode_int32 in _orth_decode  
						
						
						
					 
					
						2015-07-21 20:40:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9cfa59ec33 
							
						 
					 
					
						
						
							
							* Optimistically try orth encoding, with char as a back-off  
						
						
						
					 
					
						2015-07-21 20:22:45 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c8b89e37a5 
							
						 
					 
					
						
						
							
							* Bug fix to faster huffman decoding  
						
						
						
					 
					
						2015-07-21 20:05:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b166d1d2a2 
							
						 
					 
					
						
						
							
							* Use encode32 and decode32  
						
						
						
					 
					
						2015-07-21 19:59:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c6cd0ddce8 
							
						 
					 
					
						
						
							
							* Add faster encode_int32 and decode_int32 methods  
						
						
						
					 
					
						2015-07-21 19:58:45 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dd60594f41 
							
						 
					 
					
						
						
							
							* Fix double encoding error in strings.pyx  
						
						
						
					 
					
						2015-07-20 13:52:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							06639dc497 
							
						 
					 
					
						
						
							
							* Add length cap to word shape feature  
						
						
						
					 
					
						2015-07-20 12:06:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							128b6d9714 
							
						 
					 
					
						
						
							
							* Move Utf8Str struct to strings module, as that's the only place it's relevant  
						
						
						
					 
					
						2015-07-20 12:06:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							01a97b90f3 
							
						 
					 
					
						
						
							
							* Fix header for string store  
						
						
						
					 
					
						2015-07-20 12:06:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							52d538ea42 
							
						 
					 
					
						
						
							
							* Fix short string optimization in strings.pyx. StringStore tests now all pass.  
						
						
						
					 
					
						2015-07-20 12:05:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							09a3055630 
							
						 
					 
					
						
						
							
							* Work on short string optimization in Utf8Str  
						
						
						
					 
					
						2015-07-20 11:26:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bb0ba1f0cd 
							
						 
					 
					
						
						
							
							* Improve serialization speed  
						
						
						
					 
					
						2015-07-20 03:27:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8743a8c084 
							
						 
					 
					
						
						
							
							* Update Doc serialization for new Packer interface  
						
						
						
					 
					
						2015-07-20 01:38:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1f7170e0e1 
							
						 
					 
					
						
						
							
							* Reinstate the fixed vocabulary --- words are only added to the lexicon in init_model, after that we create LexemeC structs with the Pool given to us.  
						
						
						
					 
					
						2015-07-20 01:37:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5a7d060d9c 
							
						 
					 
					
						
						
							
							* Switch between the orth and char codecs depending on which is shorter for that message. Mostly orth is shorter, except if there are OOV words.  
						
						
						
					 
					
						2015-07-20 01:36:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5a042ee0d3 
							
						 
					 
					
						
						
							
							* Add function to predict number of bits needed to encode message  
						
						
						
					 
					
						2015-07-20 01:35:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b89b489bb4 
							
						 
					 
					
						
						
							
							* Implement both character and orth encoding in Packer, so that we can decide which to use per-text  
						
						
						
					 
					
						2015-07-19 22:39:45 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ae78c9e3ce 
							
						 
					 
					
						
						
							
							* Implement character-based codec, so that we can do word/char backoff  
						
						
						
					 
					
						2015-07-19 22:03:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cd1d047cb8 
							
						 
					 
					
						
						
							
							* Delete out-dated HuffmanCodec comment  
						
						
						
					 
					
						2015-07-19 18:28:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b8086067d5 
							
						 
					 
					
						
						
							
							* Build Huffman codec from unsorted inputs  
						
						
						
					 
					
						2015-07-19 17:58:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							317cbbc015 
							
						 
					 
					
						
						
							
							* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time.  
						
						
						
					 
					
						2015-07-19 15:18:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6b13e7227c 
							
						 
					 
					
						
						
							
							* Remove duplicate get_lex_attr method from doc.pyx  
						
						
						
					 
					
						2015-07-18 22:46:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e49c7f1478 
							
						 
					 
					
						
						
							
							* Update oov check in tokenizer  
						
						
						
					 
					
						2015-07-18 22:45:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cfd842769e 
							
						 
					 
					
						
						
							
							* Allow infix tokens to be variable length  
						
						
						
					 
					
						2015-07-18 22:45:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5b4c78bbb2 
							
						 
					 
					
						
						
							
							* Use an AttributeCodec based on orth for words. Still no oov handling mechanism.  
						
						
						
					 
					
						2015-07-18 22:43:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							82d84b0f2b 
							
						 
					 
					
						
						
							
							* Index lexemes by orth, instead of a lexemes vector. Breaks the mechanism for deciding not to own LexemeC structs during parsing. Need to reinstate this.  
						
						
						
					 
					
						2015-07-18 22:42:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4dddc8a69b 
							
						 
					 
					
						
						
							
							* Fix type declarations for attr_t. Remove unused id_t.  
						
						
						
					 
					
						2015-07-18 22:39:57 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ced59ab9ea 
							
						 
					 
					
						
						
							
							* Make minor efficiency improvement in Doc.__iter__  
						
						
						
					 
					
						2015-07-18 04:10:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cd91914dd8 
							
						 
					 
					
						
						
							
							* Fix hard-coded length  
						
						
						
					 
					
						2015-07-18 04:09:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b1d74ce60d 
							
						 
					 
					
						
						
							
							* Remove unused joint.pyx and joint.pxd files  
						
						
						
					 
					
						2015-07-17 23:31:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c27514512b 
							
						 
					 
					
						
						
							
							* Remove cruft ner/ directory  
						
						
						
					 
					
						2015-07-17 23:24:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f8d6d319f4 
							
						 
					 
					
						
						
							
							* Remove cruft module  
						
						
						
					 
					
						2015-07-17 23:23:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fb0a641a2d 
							
						 
					 
					
						
						
							
							* Don't release the gil around Parser.parse. Does this indicate thread problems?  
						
						
						
					 
					
						2015-07-17 23:07:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e29daea85f 
							
						 
					 
					
						
						
							
							* Fix bint/int typing problem in TransitionSystem. In C++ bint* means bool*, but in C it means int*. So, type-casting to bint* is unsafe.  
						
						
						
					 
					
						2015-07-17 22:37:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cf0c788892 
							
						 
					 
					
						
						
							
							* Tests passing on round-trip pack/unpack on basic example  
						
						
						
					 
					
						2015-07-17 21:20:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							44f39a876f 
							
						 
					 
					
						
						
							
							* Add a blank attrs.pyx  
						
						
						
					 
					
						2015-07-17 16:40:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c2c83120d4 
							
						 
					 
					
						
						
							
							* Remove codec property from Vocab  
						
						
						
					 
					
						2015-07-17 16:40:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dfdf19f6a9 
							
						 
					 
					
						
						
							
							* Draft a from_orth method for Doc  
						
						
						
					 
					
						2015-07-17 16:39:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9e3f17051b 
							
						 
					 
					
						
						
							
							* Move to ORTH instead of ID for encoding lexemes. Basic tests of the codec wrappers now passing  
						
						
						
					 
					
						2015-07-17 16:38:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							15ff739996 
							
						 
					 
					
						
						
							
							* Fix passing of ID attribute in string store  
						
						
						
					 
					
						2015-07-17 14:49:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							95e57c2780 
							
						 
					 
					
						
						
							
							* Remove unnecessary key and id properties from Utf8String.  
						
						
						
					 
					
						2015-07-17 01:40:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							234c7e440a 
							
						 
					 
					
						
						
							
							* Add spacy/serialize/__init__ files  
						
						
						
					 
					
						2015-07-17 01:37:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							db9dfd2e23 
							
						 
					 
					
						
						
							
							* Major refactor of serialization. Nearly complete now.  
						
						
						
					 
					
						2015-07-17 01:27:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c8282f9934 
							
						 
					 
					
						
						
							
							* Work on serialization. Needs more reorganisation  
						
						
						
					 
					
						2015-07-16 19:56:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d8458d6a25 
							
						 
					 
					
						
						
							
							* Fix attr_id_t import in Spans  
						
						
						
					 
					
						2015-07-16 19:55:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d1cb30dbc4 
							
						 
					 
					
						
						
							
							* Remove unnecessary key and id properties from Utf8String.  
						
						
						
					 
					
						2015-07-16 19:29:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							897de2d438 
							
						 
					 
					
						
						
							
							* Add 'bitter' property for serializer in English class  
						
						
						
					 
					
						2015-07-16 17:47:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fb54052ae0 
							
						 
					 
					
						
						
							
							* Work on serializer design  
						
						
						
					 
					
						2015-07-16 17:46:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a6f401580d 
							
						 
					 
					
						
						
							
							* Add from_array function to Doc.  
						
						
						
					 
					
						2015-07-16 17:46:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2a5d050134 
							
						 
					 
					
						
						
							
							* Give codec loading back to Vocab.  
						
						
						
					 
					
						2015-07-16 17:45:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8bf0f65f1c 
							
						 
					 
					
						
						
							
							* Remove dead code in strings.pyx  
						
						
						
					 
					
						2015-07-16 17:35:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a9c3863665 
							
						 
					 
					
						
						
							
							* Fix inefficiency in StringStore.dump function  
						
						
						
					 
					
						2015-07-16 17:34:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b59d271510 
							
						 
					 
					
						
						
							
							* Move serialization functionality into Serializer class  
						
						
						
					 
					
						2015-07-16 11:23:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							30be4f15da 
							
						 
					 
					
						
						
							
							* Import attrs from spacy.attrs, not spacy.typedefs  
						
						
						
					 
					
						2015-07-16 11:23:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6c99e5f4aa 
							
						 
					 
					
						
						
							
							* Move serialization into Serializer class, with __call__ and train() api  
						
						
						
					 
					
						2015-07-16 11:22:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e2133d990e 
							
						 
					 
					
						
						
							
							* Move serialization functionality out into a Serializer object  
						
						
						
					 
					
						2015-07-16 11:21:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a6d040bd11 
							
						 
					 
					
						
						
							
							* Import Lexeme attrs from spacy.attrs, not spacy.typedefs  
						
						
						
					 
					
						2015-07-16 11:20:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							45ae1ce428 
							
						 
					 
					
						
						
							
							* Remove unused declaration in parser  
						
						
						
					 
					
						2015-07-16 01:27:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							efa80096f1 
							
						 
					 
					
						
						
							
							* Upd attrs id list  
						
						
						
					 
					
						2015-07-16 01:26:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							01fab6bb90 
							
						 
					 
					
						
						
							
							* Improve de/serialize functions  
						
						
						
					 
					
						2015-07-16 01:26:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0e07c1ed2a 
							
						 
					 
					
						
						
							
							* draft de/serialization functions in doc.pyx  
						
						
						
					 
					
						2015-07-16 01:16:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9d956b07e9 
							
						 
					 
					
						
						
							
							* Fix import of attrs in doc.pyx, and update the get_token_attr function.  
						
						
						
					 
					
						2015-07-16 01:15:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							65251e7625 
							
						 
					 
					
						
						
							
							* Remove redundant attr_id_t from typedefs.pxd  
						
						
						
					 
					
						2015-07-16 00:58:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9a8db9743c 
							
						 
					 
					
						
						
							
							* Remove gil from parser.call  
						
						
						
					 
					
						2015-07-14 23:47:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							38ca0c33f5 
							
						 
					 
					
						
						
							
							Merge branch 'neuralnet' into refactor  
						
						... 
						
						
						
						Mostly refactors parser, to use new thinc3.2 Example class.
Aim is to remove use of shared memory, so that we can parallelize
over documents easily.
Conflicts:
	setup.py
	spacy/syntax/parser.pxd
	spacy/syntax/parser.pyx
	spacy/syntax/stateclass.pyx 
						
					 
					
						2015-07-14 14:13:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							935ac53ee3 
							
						 
					 
					
						
						
							
							* Extend count_by method  
						
						
						
					 
					
						2015-07-14 03:20:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3b5baa660f 
							
						 
					 
					
						
						
							
							* Fix tokenizer  
						
						
						
					 
					
						2015-07-14 00:10:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2ae0b439b2 
							
						 
					 
					
						
						
							
							* Fix space check in gold.pyx  
						
						
						
					 
					
						2015-07-14 00:10:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							81aa4e6dcc 
							
						 
					 
					
						
						
							
							* Go back to having token reference doc, instead of complicated gymnastics. Rename the attr 'doc', to expose it in the API  
						
						
						
					 
					
						2015-07-14 00:10:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							24d6ce99ec 
							
						 
					 
					
						
						
							
							* Add comment to tokenizer, explaining the spacy attr  
						
						
						
					 
					
						2015-07-13 22:29:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8214b74eec 
							
						 
					 
					
						
						
							
							* Restore _py_tokens cache, to handle orphan tokens.  
						
						
						
					 
					
						2015-07-13 22:28:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							67641f3b58 
							
						 
					 
					
						
						
							
							* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string  
						
						
						
					 
					
						2015-07-13 21:46:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6eef0bf9ab 
							
						 
					 
					
						
						
							
							* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx  
						
						
						
					 
					
						2015-07-13 20:20:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3ea8756c24 
							
						 
					 
					
						
						
							
							* Add spacy/tokens/doc.pyx, for Doc class in its own file  
						
						
						
					 
					
						2015-07-13 19:58:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c99387155f 
							
						 
					 
					
						
						
							
							* Refactor tokens, moving classes into a module instead of a single file  
						
						
						
					 
					
						2015-07-13 19:49:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d27899658e 
							
						 
					 
					
						
						
							
							* Import classes in spacy.tokens.__init__  
						
						
						
					 
					
						2015-07-13 19:48:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aa82caf8f5 
							
						 
					 
					
						
						
							
							* Add TokenC.spacy attr  
						
						
						
					 
					
						2015-07-13 19:48:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dba6b47d4e 
							
						 
					 
					
						
						
							
							* Refactor monster tokens.pyx file, into a tokens/ subpackage. Try to break the cycle between Doc and Token, and remove the need to pass around a unicode string reference  
						
						
						
					 
					
						2015-07-13 19:20:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5b0a7190c9 
							
						 
					 
					
						
						
							
							* Round-trip for serialization finally working. Needs a lot of optimization.  
						
						
						
					 
					
						2015-07-13 18:39:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							edd371246c 
							
						 
					 
					
						
						
							
							* Make huffman coder take BitArray in encode/decode. Add __iter__ method to BitArray.  
						
						
						
					 
					
						2015-07-13 17:33:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							af5cc926a4 
							
						 
					 
					
						
						
							
							* Add codec property to Vocab, to use the Huffman encoding  
						
						
						
					 
					
						2015-07-13 13:55:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							77385d5580 
							
						 
					 
					
						
						
							
							* Make .pxd file for huffman codec  
						
						
						
					 
					
						2015-07-13 13:54:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							083b6ea7ae 
							
						 
					 
					
						
						
							
							* Clean up encoder a bit. Now ready for integration into Vocab.  
						
						
						
					 
					
						2015-07-13 12:57:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8d0f1d98da 
							
						 
					 
					
						
						
							
							* Draft docstring for HuffmanCache  
						
						
						
					 
					
						2015-07-13 12:01:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							281f1faefb 
							
						 
					 
					
						
						
							
							* Nearly finished huffman coder  
						
						
						
					 
					
						2015-07-12 23:48:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e1a25fba32 
							
						 
					 
					
						
						
							
							* Work on huffman coder  
						
						
						
					 
					
						2015-07-12 19:58:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3fb9de2d13 
							
						 
					 
					
						
						
							
							* Remove vector[bint], in favor of simple Code struct.  
						
						
						
					 
					
						2015-07-12 17:58:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aa7bfd932b 
							
						 
					 
					
						
						
							
							* Work on compressor  
						
						
						
					 
					
						2015-07-12 16:03:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							14eafcab15 
							
						 
					 
					
						
						
							
							* Refactor to use vector[bint]  
						
						
						
					 
					
						2015-07-12 05:27:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6a6e852a39 
							
						 
					 
					
						
						
							
							* Refactor huffman coding stuff into class  
						
						
						
					 
					
						2015-07-12 05:06:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aad96fdb5c 
							
						 
					 
					
						
						
							
							* Improve efficiency of huffman coding  
						
						
						
					 
					
						2015-07-12 01:31:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ff9ff6f3fa 
							
						 
					 
					
						
						
							
							* Ensure unseen words are given low log probability  
						
						
						
					 
					
						2015-07-12 01:31:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9d3b0d83de 
							
						 
					 
					
						
						
							
							* Refactor huffman coding  
						
						
						
					 
					
						2015-07-11 22:27:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8d29406cd6 
							
						 
					 
					
						
						
							
							* Rename span.right to span.rights  
						
						
						
					 
					
						2015-07-11 22:15:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							da9f358166 
							
						 
					 
					
						
						
							
							* Fix span getting  
						
						
						
					 
					
						2015-07-11 21:41:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							11e8f2ffb4 
							
						 
					 
					
						
						
							
							* Huffman codes working  
						
						
						
					 
					
						2015-07-11 20:01:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cb6fc81909 
							
						 
					 
					
						
						
							
							* Work on huffman coding.  
						
						
						
					 
					
						2015-07-11 15:23:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4c9b77fe95 
							
						 
					 
					
						
						
							
							* Begin working on serialization code  
						
						
						
					 
					
						2015-07-11 10:57:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							53d1f5b2eb 
							
						 
					 
					
						
						
							
							* Rename Span.head to Span.root.  
						
						
						
					 
					
						2015-07-09 17:30:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c0255ed7d8 
							
						 
					 
					
						
						
							
							* Allow slice indexing in Doc.__getitem__, returning a Span object  
						
						
						
					 
					
						2015-07-09 15:15:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							89a91ad726 
							
						 
					 
					
						
						
							
							* Add SPACE part-of-speech tag, and train tagger to assign it. Also train tagger not to make whitespace an entity  
						
						
						
					 
					
						2015-07-09 13:30:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							55f1042443 
							
						 
					 
					
						
						
							
							* Improve efficiency of L and R features, correcting the non-linear-in-length problem.  
						
						
						
					 
					
						2015-07-09 12:17:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							70d2acb579 
							
						 
					 
					
						
						
							
							* Fix edge features  
						
						
						
					 
					
						2015-07-09 12:15:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							adb868bdad 
							
						 
					 
					
						
						
							
							* Add warning for models not found in parser  
						
						
						
					 
					
						2015-07-08 20:04:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							05b28ec9eb 
							
						 
					 
					
						
						
							
							* Add warning for models not found in parser  
						
						
						
					 
					
						2015-07-08 20:02:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ef700401a6 
							
						 
					 
					
						
						
							
							* Add warning for models not found in parser  
						
						
						
					 
					
						2015-07-08 20:00:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6218d8b389 
							
						 
					 
					
						
						
							
							* Add warning for models not found in parser  
						
						
						
					 
					
						2015-07-08 19:59:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f6a6c39ce8 
							
						 
					 
					
						
						
							
							* Add warning for models not found in parser  
						
						
						
					 
					
						2015-07-08 19:52:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							78db7e32f7 
							
						 
					 
					
						
						
							
							* Remove has_sense method from Lexeme declaration  
						
						
						
					 
					
						2015-07-08 19:41:20 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6ddb2f5e45 
							
						 
					 
					
						
						
							
							* Restore merge_mwe in English class  
						
						
						
					 
					
						2015-07-08 19:35:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6859f6adac 
							
						 
					 
					
						
						
							
							* Restore merge_mwe in English class  
						
						
						
					 
					
						2015-07-08 19:34:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3c270fc8ff 
							
						 
					 
					
						
						
							
							* Remove has_sense method from Lexeme  
						
						
						
					 
					
						2015-07-08 19:28:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b64c843861 
							
						 
					 
					
						
						
							
							* Remove senses attr  
						
						
						
					 
					
						2015-07-08 19:26:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1d3a592edf 
							
						 
					 
					
						
						
							
							* Remove the senses attr from LexemeC, to keep data compatibility  
						
						
						
					 
					
						2015-07-08 19:24:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0ceb1f71c2 
							
						 
					 
					
						
						
							
							* Update parse features  
						
						
						
					 
					
						2015-07-08 19:11:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2e51b5027a 
							
						 
					 
					
						
						
							
							* Alias Doc to Tokens, for backwards compatibility  
						
						
						
					 
					
						2015-07-08 18:59:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e3c53f5ecd 
							
						 
					 
					
						
						
							
							* Fix mention of Tokens in docstring  
						
						
						
					 
					
						2015-07-08 18:56:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bb522496dd 
							
						 
					 
					
						
						
							
							* Rename Tokens to Doc  
						
						
						
					 
					
						2015-07-08 18:53:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b24e8be2b9 
							
						 
					 
					
						
						
							
							* Whitespace in docstring  
						
						
						
					 
					
						2015-07-08 12:37:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							abc43b852d 
							
						 
					 
					
						
						
							
							* Add pos_tags attr to Vocab.  
						
						
						
					 
					
						2015-07-08 12:36:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							935bcdf3e5 
							
						 
					 
					
						
						
							
							* Remove redundant tag_names argument to Tokenizer  
						
						
						
					 
					
						2015-07-08 12:36:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ff885e8511 
							
						 
					 
					
						
						
							
							* Add ParserFactory convenience function  
						
						
						
					 
					
						2015-07-08 12:35:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4e4fac452b 
							
						 
					 
					
						
						
							
							* Refactor __init__ for simplicity. Allow parse=True, tag=True etc flags to be passed at top-level. Do not lazy-load parser.  
						
						
						
					 
					
						2015-07-08 12:35:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1d2deb4616 
							
						 
					 
					
						
						
							
							* Work on refactoring default arguments to English.__init__  
						
						
						
					 
					
						2015-07-07 15:53:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2d0e99a096 
							
						 
					 
					
						
						
							
							* Pass pos_tags into Tokenizer.from_dir  
						
						
						
					 
					
						2015-07-07 14:23:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6788c86b2f 
							
						 
					 
					
						
						
							
							* Begin refactor  
						
						
						
					 
					
						2015-07-07 14:00:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							52fd80c6c6 
							
						 
					 
					
						
						
							
							* Add experimental supersense features for parsing, based on lookup into wordnet.  
						
						
						
					 
					
						2015-07-01 20:12:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e6d828a9af 
							
						 
					 
					
						
						
							
							* Set up an array POS_SENSES that denotes the set of valid senses for each POS tag. This way, we can do bitwise & between a lexeme's senses and the ones available for its POS tag, to get the allowable senses for the token.  
						
						
						
					 
					
						2015-07-01 20:12:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2b8459d9a8 
							
						 
					 
					
						
						
							
							* Add senses flag to Lexeme  
						
						
						
					 
					
						2015-07-01 20:10:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e23d1582a2 
							
						 
					 
					
						
						
							
							* Add supersense data to Lexeme objects. Add simple has_sense method to check the flag.  
						
						
						
					 
					
						2015-07-01 18:50:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							64fafa98be 
							
						 
					 
					
						
						
							
							* Add senses.pyx and senses.pxd  
						
						
						
					 
					
						2015-07-01 18:49:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							94dab94e5f 
							
						 
					 
					
						
						
							
							Merge branch 'master' of  https://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-06-30 18:16:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9af86b0b0b 
							
						 
					 
					
						
						
							
							* Fix attrs.pxd  
						
						
						
					 
					
						2015-06-30 18:16:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							af9c82f7a6 
							
						 
					 
					
						
						
							
							Merge branch 'master' of  https://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-06-30 18:11:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5d595b5a8c 
							
						 
					 
					
						
						
							
							* Inc versions  
						
						
						
					 
					
						2015-06-30 18:11:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d2eeba6667 
							
						 
					 
					
						
						
							
							* Start wiring up color and emotion lexicons. Hopefully we get to use them.  
						
						
						
					 
					
						2015-06-30 16:22:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e20106fdff 
							
						 
					 
					
						
						
							
							* Begin reorganizing neuralnet work  
						
						
						
					 
					
						2015-06-30 14:26:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5cd3ed42d4 
							
						 
					 
					
						
						
							
							* Reenable averaging  
						
						
						
					 
					
						2015-06-29 16:44:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							894cbef8ba 
							
						 
					 
					
						
						
							
							* Wire eta and mu parameters up for neural net  
						
						
						
					 
					
						2015-06-29 07:10:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3bb5876c5a 
							
						 
					 
					
						
						
							
							* Inline methods in StateClass  
						
						
						
					 
					
						2015-06-29 01:10:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							313a7f87b3 
							
						 
					 
					
						
						
							
							* Inline methods in StateClass  
						
						
						
					 
					
						2015-06-29 01:06:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a02fd3af5d 
							
						 
					 
					
						
						
							
							* Check valency in L and R feature methods, to make feature calculation faster  
						
						
						
					 
					
						2015-06-29 00:27:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5d870720bc 
							
						 
					 
					
						
						
							
							* Check valency in L and R feature methods, to make feature calculation faster  
						
						
						
					 
					
						2015-06-29 00:17:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f4986d5d3c 
							
						 
					 
					
						
						
							
							* Use new Example class  
						
						
						
					 
					
						2015-06-28 22:36:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							735f1af91f 
							
						 
					 
					
						
						
							
							* Fix neural net stuff  
						
						
						
					 
					
						2015-06-28 11:44:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e7003f1cf3 
							
						 
					 
					
						
						
							
							* Remove hard-coding of vector lengths  
						
						
						
					 
					
						2015-06-28 11:37:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							897dd0dd0b 
							
						 
					 
					
						
						
							
							* Merge changes, and adjust Example to use memoryview  
						
						
						
					 
					
						2015-06-28 11:36:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9282a8e72c 
							
						 
					 
					
						
						
							
							* Prepare for new models to be plugged in by using Example class  
						
						
						
					 
					
						2015-06-28 11:02:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							75aeccc064 
							
						 
					 
					
						
						
							
							* Rejig parser interface to use new thinc.api.Example class, in prep of theano model. Comment out beam search  
						
						
						
					 
					
						2015-06-28 11:02:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bf33598b34 
							
						 
					 
					
						
						
							
							* Work on a theano-driven model for the parser  
						
						
						
					 
					
						2015-06-28 11:02:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bbef71f213 
							
						 
					 
					
						
						
							
							* Fix min function in fill_context  
						
						
						
					 
					
						2015-06-28 10:46:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							142b6f9510 
							
						 
					 
					
						
						
							
							* Revert last changes  
						
						
						
					 
					
						2015-06-28 10:44:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b06962f18b 
							
						 
					 
					
						
						
							
							* Pad buffers in state  
						
						
						
					 
					
						2015-06-28 10:36:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							53be72387c 
							
						 
					 
					
						
						
							
							* Hack at fill_context to investigate performance loss  
						
						
						
					 
					
						2015-06-28 10:34:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							71a4e876a9 
							
						 
					 
					
						
						
							
							* Fix parse features  
						
						
						
					 
					
						2015-06-28 09:27:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0c4b5a2bb0 
							
						 
					 
					
						
						
							
							* Start scoring tokens  
						
						
						
					 
					
						2015-06-28 06:21:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5af500909c 
							
						 
					 
					
						
						
							
							* Remove unused directive from parser.pyx  
						
						
						
					 
					
						2015-06-28 06:20:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d5b4090705 
							
						 
					 
					
						
						
							
							* Add profile directive  
						
						
						
					 
					
						2015-06-28 06:19:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2b5421e60c 
							
						 
					 
					
						
						
							
							* Add profile directive  
						
						
						
					 
					
						2015-06-28 06:07:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8b5de4a411 
							
						 
					 
					
						
						
							
							* Add word / tag / label sets, for use in neural net  
						
						
						
					 
					
						2015-06-28 05:46:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cfcbd8d256 
							
						 
					 
					
						
						
							
							* Fix punctuation eval in scorer.py  
						
						
						
					 
					
						2015-06-28 01:31:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ed40a8380e 
							
						 
					 
					
						
						
							
							* Remove hard-coding of vector lengths  
						
						
						
					 
					
						2015-06-27 04:18:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ebe630cc8d 
							
						 
					 
					
						
						
							
							* Enable more features for NN  
						
						
						
					 
					
						2015-06-27 04:17:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f8bb43475e 
							
						 
					 
					
						
						
							
							* Bridge to Theano working. Very disorganised. Using thinc adb60aba966ed2  
						
						
						
					 
					
						2015-06-27 02:39:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2fe98b8a9a 
							
						 
					 
					
						
						
							
							* Prepare for new models to be plugged in by using Example class  
						
						
						
					 
					
						2015-06-26 13:51:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6896455884 
							
						 
					 
					
						
						
							
							* Rejig parser interface to use new thinc.api.Example class, in prep of theano model. Comment out beam search  
						
						
						
					 
					
						2015-06-26 06:25:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b266a63f2c 
							
						 
					 
					
						
						
							
							* Inc version of downloadable data  
						
						
						
					 
					
						2015-06-24 04:53:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							02b171ee67 
							
						 
					 
					
						
						
							
							* Bug fixes to edge calculation  
						
						
						
					 
					
						2015-06-24 04:28:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a4e9bdf4c1 
							
						 
					 
					
						
						
							
							* Work on a theano-driven model for the parser  
						
						
						
					 
					
						2015-06-24 01:02:40 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7f9384f53c 
							
						 
					 
					
						
						
							
							* Remove deprecated _state module  
						
						
						
					 
					
						2015-06-23 17:28:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6dbe182491 
							
						 
					 
					
						
						
							
							* Fix merge conflicts  
						
						
						
					 
					
						2015-06-23 17:28:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							579735a095 
							
						 
					 
					
						
						
							
							* Remove import of _state module  
						
						
						
					 
					
						2015-06-23 17:25:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							88f55d136b 
							
						 
					 
					
						
						
							
							* Remove deprecated _state module  
						
						
						
					 
					
						2015-06-23 17:19:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9ab9dd2bf7 
							
						 
					 
					
						
						
							
							* Clean up unused orig_arc_eager and tree_arc_eager modules, which were only added for EMNLP experiments  
						
						
						
					 
					
						2015-06-23 17:17:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7ebfe4b983 
							
						 
					 
					
						
						
							
							* Fixes to edge features  
						
						
						
					 
					
						2015-06-23 16:32:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7b125f5a86 
							
						 
					 
					
						
						
							
							* Fixes to edge features  
						
						
						
					 
					
						2015-06-23 16:31:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8d4bbacfc5 
							
						 
					 
					
						
						
							
							* Fix edge navigation in Token objects  
						
						
						
					 
					
						2015-06-23 16:07:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							35c290bee4 
							
						 
					 
					
						
						
							
							* Fix edge features  
						
						
						
					 
					
						2015-06-23 15:50:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							221e2e485f 
							
						 
					 
					
						
						
							
							* Assign 'ROOT' as label, not 'root'  
						
						
						
					 
					
						2015-06-23 15:09:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a7bf7b0626 
							
						 
					 
					
						
						
							
							* Rename sent_start to sent_end, to reflect its new usage in the Break transition  
						
						
						
					 
					
						2015-06-23 05:39:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ee3e56f27b 
							
						 
					 
					
						
						
							
							* Fix bounds checking on entities  
						
						
						
					 
					
						2015-06-23 04:35:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							43ef5ddea5 
							
						 
					 
					
						
						
							
							* Ensure root label is spelled ROOT, for backwards compatibility  
						
						
						
					 
					
						2015-06-23 04:14:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							065c2e1d2d 
							
						 
					 
					
						
						
							
							* Add some bounds checking around state arrays  
						
						
						
					 
					
						2015-06-23 04:13:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							89ae218b75 
							
						 
					 
					
						
						
							
							* Add import to tokens.pyx from weird Cython compiler issue with casting from memory views  
						
						
						
					 
					
						2015-06-23 03:04:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f01b3d043e 
							
						 
					 
					
						
						
							
							* Add padding to arrays in stateclass. May be papering over a deeper bug.  
						
						
						
					 
					
						2015-06-23 03:03:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5e94b5d581 
							
						 
					 
					
						
						
							
							* Have Tokens return proper numpy arrays, not Cython views.  
						
						
						
					 
					
						2015-06-23 00:07:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							69507bc729 
							
						 
					 
					
						
						
							
							* Re-enable Break transition in arc_eager.pyx  
						
						
						
					 
					
						2015-06-23 00:03:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cc579ed429 
							
						 
					 
					
						
						
							
							* Add __len__ function to StringStore  
						
						
						
					 
					
						2015-06-23 00:02:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							46fb24e9fd 
							
						 
					 
					
						
						
							
							* Add cycle-checking code in gold.pyx  
						
						
						
					 
					
						2015-06-23 00:02:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							60d26243e3 
							
						 
					 
					
						
						
							
							* Fix head alignment in read_conll.parse, which was causing corrupt parses when strip_bad_periods=True. A similar problem may apply to other data readers.  
						
						
						
					 
					
						2015-06-18 16:35:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f868175e43 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-06-16 23:37:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ab110be125 
							
						 
					 
					
						
						
							
							* Remove debugging in parser.pyx  
						
						
						
					 
					
						2015-06-16 23:37:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9b13d11ab3 
							
						 
					 
					
						
						
							
							* Fix handling of entities in StateClass  
						
						
						
					 
					
						2015-06-16 23:35:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c40a2c661c 
							
						 
					 
					
						
						
							
							* Add tree_arc_eager  
						
						
						
					 
					
						2015-06-15 08:23:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5da5cf7084 
							
						 
					 
					
						
						
							
							* Add some more features for S1/S0  
						
						
						
					 
					
						2015-06-15 04:07:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8156a01bca 
							
						 
					 
					
						
						
							
							* Fix root label for orig_arc_eager  
						
						
						
					 
					
						2015-06-15 02:54:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							21930ede15 
							
						 
					 
					
						
						
							
							* Switch toggle on USE_ROOT_ARC_SEGMENT  
						
						
						
					 
					
						2015-06-15 02:54:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							38a6afa484 
							
						 
					 
					
						
						
							
							* Make possibly dubious correction to the unshift oracle  
						
						
						
					 
					
						2015-06-15 02:50:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f66228f253 
							
						 
					 
					
						
						
							
							* Add some more features, esp for labels  
						
						
						
					 
					
						2015-06-14 21:18:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3da8e0f317 
							
						 
					 
					
						
						
							
							* Add orig_arc_eager  
						
						
						
					 
					
						2015-06-14 20:31:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ea8a103007 
							
						 
					 
					
						
						
							
							* Fix import of TransitionSystem in parser.pyx  
						
						
						
					 
					
						2015-06-14 19:01:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e0984ca139 
							
						 
					 
					
						
						
							
							* Fix valency features in StateClass  
						
						
						
					 
					
						2015-06-14 17:50:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e50ac1a47f 
							
						 
					 
					
						
						
							
							* Add verbose printing to scorer  
						
						
						
					 
					
						2015-06-14 17:45:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							763cbd23d5 
							
						 
					 
					
						
						
							
							* Upd stateclass.print_state  
						
						
						
					 
					
						2015-06-14 17:44:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bdd07bf000 
							
						 
					 
					
						
						
							
							* Fix Break oracle, but disable the Break transition for now, while we finalize the gold-standard experiments  
						
						
						
					 
					
						2015-06-14 17:44:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							399f15fbdf 
							
						 
					 
					
						
						
							
							* Add flag to toggle handling of multi-root inputs without the Break transition. Clear up now unused best_valid stuff.  
						
						
						
					 
					
						2015-06-14 00:28:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							75289b4761 
							
						 
					 
					
						
						
							
							* Don't refuse to parse single token sentences, in case some transition system needs them, e.g. single word entity. Instead fix error in _init_state.  
						
						
						
					 
					
						2015-06-13 22:55:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							77d7e79c7e 
							
						 
					 
					
						
						
							
							* Fix r/l and distance features.  
						
						
						
					 
					
						2015-06-12 13:06:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b643cb3d5c 
							
						 
					 
					
						
						
							
							* Allow training documents to be filtered in gold.pyx  
						
						
						
					 
					
						2015-06-12 02:42:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							15e177d7a1 
							
						 
					 
					
						
						
							
							* Fixes to unshift/fast-forward strategy. Getting 91.55 greedy on NW dev, gold preproc  
						
						
						
					 
					
						2015-06-12 01:50:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							afd77a529b 
							
						 
					 
					
						
						
							
							* Prepare for break transition, with fast-forwarding. 86.5 on 1k nw gold preproc  
						
						
						
					 
					
						2015-06-10 14:08:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							495f528709 
							
						 
					 
					
						
						
							
							* Add support for sentence breaks in stateclass  
						
						
						
					 
					
						2015-06-10 12:34:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b7b18c279d 
							
						 
					 
					
						
						
							
							* Fix Reduce oracle. Getting 86.35  
						
						
						
					 
					
						2015-06-10 11:33:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bb09b5d91a 
							
						 
					 
					
						
						
							
							* Fix shifted bit vector in stateclass --- should reflect whether the word has been *unshifted*.  
						
						
						
					 
					
						2015-06-10 11:33:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aa9625f688 
							
						 
					 
					
						
						
							
							* Do non-monotonic Unshift. Every word can be shifted at most 1 time. When the Reduce move is used, if S0 has no head, we put the word back on the buffer. Gets 86.4 on nw 1k with gold pre-proc. Break transition not yet implemented for this.  
						
						
						
					 
					
						2015-06-10 10:15:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7bf6b7de3e 
							
						 
					 
					
						
						
							
							* Add unshift action to StateClass, and track which moves have been shifted  
						
						
						
					 
					
						2015-06-10 10:13:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f7c8069e65 
							
						 
					 
					
						
						
							
							* Fix bug in distance feature  
						
						
						
					 
					
						2015-06-10 10:12:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							abd07c067a 
							
						 
					 
					
						
						
							
							* Inline B and S methods on stateclass  
						
						
						
					 
					
						2015-06-10 07:22:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e2f9a80713 
							
						 
					 
					
						
						
							
							* Remove old _state imports  
						
						
						
					 
					
						2015-06-10 07:09:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e9aaecc619 
							
						 
					 
					
						
						
							
							* Remove from_struct method from StateClass  
						
						
						
					 
					
						2015-06-10 06:58:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							18cc326dc0 
							
						 
					 
					
						
						
							
							* Bug fixes to ner.pyx  
						
						
						
					 
					
						2015-06-10 06:57:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e5570c9700 
							
						 
					 
					
						
						
							
							* Set nogil for oracle functions  
						
						
						
					 
					
						2015-06-10 06:56:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4575e7a60f 
							
						 
					 
					
						
						
							
							* Fix beam search with new StateClass  
						
						
						
					 
					
						2015-06-10 06:33:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							04b1cd9b8c 
							
						 
					 
					
						
						
							
							* Greedy parsing working with new StateClass. Beam parsing broken  
						
						
						
					 
					
						2015-06-10 04:20:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6a94b64eca 
							
						 
					 
					
						
						
							
							* Remove State* from parser.pyx entirely, switching over to StateClass. Beam parsing still untested.  
						
						
						
					 
					
						2015-06-10 02:03:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f14a1526aa 
							
						 
					 
					
						
						
							
							* Remove version of fill_context that takes State*  
						
						
						
					 
					
						2015-06-10 01:39:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d68c686ec1 
							
						 
					 
					
						
						
							
							* Move StateClass into interface of transition functions  
						
						
						
					 
					
						2015-06-10 01:35:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4b98b3e9c8 
							
						 
					 
					
						
						
							
							* Cost functions now take StateClass argument, instead of State*.  
						
						
						
					 
					
						2015-06-10 00:40:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e0cf61f591 
							
						 
					 
					
						
						
							
							* Move StateClass into the interface for is_valid  
						
						
						
					 
					
						2015-06-09 23:23:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0895d454fb 
							
						 
					 
					
						
						
							
							* Prepare to switch to using state class, instead of state struct  
						
						
						
					 
					
						2015-06-09 21:20:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2b9629ed62 
							
						 
					 
					
						
						
							
							* Begin adding stateclass to ArcEager  
						
						
						
					 
					
						2015-06-09 01:41:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ba10fd8af5 
							
						 
					 
					
						
						
							
							* Add StateClass, to replace/refactor the mess in _state  
						
						
						
					 
					
						2015-06-09 01:39:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c7e3dfc1dc 
							
						 
					 
					
						
						
							
							* Don't automatically push words when stack is empty, as it messes up beam parsing. Add hash method to beam state.  
						
						
						
					 
					
						2015-06-08 14:49:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							00a0dfcb59 
							
						 
					 
					
						
						
							
							* Avoid shipping the spacy.munge package  
						
						
						
					 
					
						2015-06-08 00:54:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7d265a9c62 
							
						 
					 
					
						
						
							
							* Revert to wget in spacy.en.download  
						
						
						
					 
					
						2015-06-08 00:48:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a8fc5f1285 
							
						 
					 
					
						
						
							
							* Fix munge/read_ner  
						
						
						
					 
					
						2015-06-08 00:35:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1515862861 
							
						 
					 
					
						
						
							
							* Fix download.py  
						
						
						
					 
					
						2015-06-08 00:08:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7e9e8f654a 
							
						 
					 
					
						
						
							
							* Use urllib in spacy.en.download  
						
						
						
					 
					
						2015-06-07 23:51:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							80cff41a9c 
							
						 
					 
					
						
						
							
							* Upd download.py  
						
						
						
					 
					
						2015-06-07 19:13:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6e2564239d 
							
						 
					 
					
						
						
							
							* Bug fixes to beam parser. Search still broken on non-gold sentences  
						
						
						
					 
					
						2015-06-07 19:12:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1ec4e6fc95 
							
						 
					 
					
						
						
							
							* Don't score whitespace tokens  
						
						
						
					 
					
						2015-06-07 19:10:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							731e5f1e46 
							
						 
					 
					
						
						
							
							* Add get() function in spacy/syntax/Config  
						
						
						
					 
					
						2015-06-07 19:09:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8f142c1838 
							
						 
					 
					
						
						
							
							* Refactor transition system oracles, to split out move and label cost. Preparing to add Unshift move. Will exclude non-monotonic.  
						
						
						
					 
					
						2015-06-07 03:21:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							89b8775887 
							
						 
					 
					
						
						
							
							* Fix output from _min_edit_path when inputs match.  
						
						
						
					 
					
						2015-06-06 05:58:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							98cfd84123 
							
						 
					 
					
						
						
							
							* Remove hyphenation from main tokenizer loop: do it in infix.txt instead. This lets emoticons work  
						
						
						
					 
					
						2015-06-06 05:57:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1fee7ade61 
							
						 
					 
					
						
						
							
							* Tweak to ner  
						
						
						
					 
					
						2015-06-05 23:48:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							33e70b167f 
							
						 
					 
					
						
						
							
							* Remove dead code from ner.pyx  
						
						
						
					 
					
						2015-06-05 17:12:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							88ac5c6e98 
							
						 
					 
					
						
						
							
							* Send beam_width < 0 to greedy parser  
						
						
						
					 
					
						2015-06-05 17:12:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0114e7600d 
							
						 
					 
					
						
						
							
							* Fix NER oracle  
						
						
						
					 
					
						2015-06-05 17:11:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c04e6ebca6 
							
						 
					 
					
						
						
							
							* Allow user to load different sized vectors.  
						
						
						
					 
					
						2015-06-05 16:26:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6bf35cecc3 
							
						 
					 
					
						
						
							
							* Refactor transition system to use classes with staticmethods.  
						
						
						
					 
					
						2015-06-05 02:27:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							36a34d544b 
							
						 
					 
					
						
						
							
							* Refactoring arc_eager, grouping oracle functions into transitions  
						
						
						
					 
					
						2015-06-04 22:43:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4433396005 
							
						 
					 
					
						
						
							
							* Improve efficiency of dynamic oracle, making beam training faster  
						
						
						
					 
					
						2015-06-04 21:15:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							079dad28a7 
							
						 
					 
					
						
						
							
							* Update for faster beam training  
						
						
						
					 
					
						2015-06-04 19:32:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f8843906ad 
							
						 
					 
					
						
						
							
							Merge branch 'constituency'  
						
						... 
						
						
						
						Add beam parsing and training from JSON files, with Levenshtein alignment. 
						
					 
					
						2015-06-03 06:07:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ae653b850a 
							
						 
					 
					
						
						
							
							* Remove unused import from gold.pyx  
						
						
						
					 
					
						2015-06-03 06:07:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a2627b6102 
							
						 
					 
					
						
						
							
							* Fix bug in refactored init_transition  
						
						
						
					 
					
						2015-06-03 06:01:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dd0867645d 
							
						 
					 
					
						
						
							
							* Remove stray const from State header  
						
						
						
					 
					
						2015-06-03 00:10:04 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6c47b10a6e 
							
						 
					 
					
						
						
							
							* Make optimization to children_in_buffer: stop searching when we would cross a bracket.  
						
						
						
					 
					
						2015-06-02 21:05:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a513ec500f 
							
						 
					 
					
						
						
							
							* Have oracle functions take a struct instead of a Python object  
						
						
						
					 
					
						2015-06-02 20:01:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d1b55310a1 
							
						 
					 
					
						
						
							
							* Refactor _advance_beam function  
						
						
						
					 
					
						2015-06-02 18:38:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0786d9b3c7 
							
						 
					 
					
						
						
							
							* Refactor TransitionSystem, adding set_valid method  
						
						
						
					 
					
						2015-06-02 18:38:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bd82a49994 
							
						 
					 
					
						
						
							
							* Add set_scores method to Model  
						
						
						
					 
					
						2015-06-02 18:37:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a3964957f6 
							
						 
					 
					
						
						
							
							* Add profiling for _state.pyx  
						
						
						
					 
					
						2015-06-02 18:36:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e822df0867 
							
						 
					 
					
						
						
							
							* Fix bugs in new greedy/beam parser  
						
						
						
					 
					
						2015-06-02 02:01:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							66dfa95847 
							
						 
					 
					
						
						
							
							* Revise greedy_parse/beam_parse ownership goof  
						
						
						
					 
					
						2015-06-02 01:34:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							75658b2ed3 
							
						 
					 
					
						
						
							
							* Remove use of new beam.loss property, to maintain compatibility with older versions of thinc for now.  
						
						
						
					 
					
						2015-06-02 00:57:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7c29362d60 
							
						 
					 
					
						
						
							
							* Rename parser class in parser.pxd, now that beam parsing is supported  
						
						
						
					 
					
						2015-06-02 00:53:49 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							58d5ac0944 
							
						 
					 
					
						
						
							
							* Add beam search capabilities to Parser. Rename GreedyParser to Parser.  
						
						
						
					 
					
						2015-06-02 00:28:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							62424e6c76 
							
						 
					 
					
						
						
							
							* Remove unused regularize argument from _ml.Model  
						
						
						
					 
					
						2015-06-02 00:27:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							adeb57cb1e 
							
						 
					 
					
						
						
							
							* Fix long line  
						
						
						
					 
					
						2015-06-01 23:07:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e09a08bd00 
							
						 
					 
					
						
						
							
							* Add copy_state function  
						
						
						
					 
					
						2015-06-01 23:06:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c7876aa8b6 
							
						 
					 
					
						
						
							
							* Add get_valid method  
						
						
						
					 
					
						2015-06-01 23:06:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d82f9d958d 
							
						 
					 
					
						
						
							
							* Remove regularization cruft from _ml, move score from .pxd file to .pyx  
						
						
						
					 
					
						2015-05-31 18:48:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5e99ff94c8 
							
						 
					 
					
						
						
							
							* Edits to arc eager oracle. Couldn't figure out how the non-monotonic lines made sense. They seem covered by children_in_stack  
						
						
						
					 
					
						2015-05-31 15:14:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6c5632b71c 
							
						 
					 
					
						
						
							
							* Roll back proposed change to Break transition while investigating its effect  
						
						
						
					 
					
						2015-05-31 06:49:52 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6bba793df3 
							
						 
					 
					
						
						
							
							* Disable the Zipf-reweighting thing while investigating its effect  
						
						
						
					 
					
						2015-05-31 06:48:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e77940565d 
							
						 
					 
					
						
						
							
							* Add length cap to distance feature  
						
						
						
					 
					
						2015-05-31 05:25:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fd596351ba 
							
						 
					 
					
						
						
							
							* Fix valency features  
						
						
						
					 
					
						2015-05-31 05:24:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							87d6551d19 
							
						 
					 
					
						
						
							
							* Allow gold parse to cut non-projective arcs  
						
						
						
					 
					
						2015-05-31 01:11:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c4f0914b4e 
							
						 
					 
					
						
						
							
							* Fix POS tag evaluation in scorer.py: do evaluate punctuation tags  
						
						
						
					 
					
						2015-05-30 18:24:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9e39a206da 
							
						 
					 
					
						
						
							
							* Fix efficiency of JSON reading, by using ujson instead of stream  
						
						
						
					 
					
						2015-05-30 17:54:52 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							76300bbb1b 
							
						 
					 
					
						
						
							
							* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag.  
						
						
						
					 
					
						2015-05-30 01:25:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b76bbbd12c 
							
						 
					 
					
						
						
							
							* Read json files recursively from a directory, instead of requiring a single .json file  
						
						
						
					 
					
						2015-05-29 03:52:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8f31d3b864 
							
						 
					 
					
						
						
							
							* Relax constraint on Break transition for non-monotonic parsing.  
						
						
						
					 
					
						2015-05-28 23:39:52 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6b2e5c4b8a 
							
						 
					 
					
						
						
							
							* Avoid NER scoring for sentences with some missing NER values.  
						
						
						
					 
					
						2015-05-28 22:39:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d25d31442d 
							
						 
					 
					
						
						
							
							* Hackishly support broken NER annotations. Should fix this.  
						
						
						
					 
					
						2015-05-27 19:14:31 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7a2725bca4 
							
						 
					 
					
						
						
							
							* Read input json in a streaming way  
						
						
						
					 
					
						2015-05-27 19:13:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6a1c91675e 
							
						 
					 
					
						
						
							
							* Add file to read ENAMEX ner data  
						
						
						
					 
					
						2015-05-27 17:36:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							732fa7709a 
							
						 
					 
					
						
						
							
							* Edits to align_raw script, for use in prepare_treebank  
						
						
						
					 
					
						2015-05-27 04:23:31 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4010b9b6d9 
							
						 
					 
					
						
						
							
							* Pass parameter for regularization in parser.pyx  
						
						
						
					 
					
						2015-05-27 03:18:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4c6058baa7 
							
						 
					 
					
						
						
							
							* Fix evaluation of NER in scorer.py  
						
						
						
					 
					
						2015-05-27 03:18:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6016ee83a6 
							
						 
					 
					
						
						
							
							* Fix reading of NER in gold.pyx  
						
						
						
					 
					
						2015-05-27 03:17:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							04bda8648d 
							
						 
					 
					
						
						
							
							* Pass parameter for regularization to model  
						
						
						
					 
					
						2015-05-27 03:16:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f69fe6a635 
							
						 
					 
					
						
						
							
							* Fix heads problem in read_conll  
						
						
						
					 
					
						2015-05-27 01:14:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0eec1d12af 
							
						 
					 
					
						
						
							
							* Add comment about zipf reweighting  
						
						
						
					 
					
						2015-05-27 01:14:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4d37b66c55 
							
						 
					 
					
						
						
							
							* Make Zipf regularization a bit more efficient  
						
						
						
					 
					
						2015-05-27 01:12:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7fc24821bc 
							
						 
					 
					
						
						
							
							* Experiment with Zipfian corruptions when calculating prediction  
						
						
						
					 
					
						2015-05-26 22:17:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							eba7b34f66 
							
						 
					 
					
						
						
							
							* Add flag to disable loading of word vectors  
						
						
						
					 
					
						2015-05-25 01:02:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3593babd35 
							
						 
					 
					
						
						
							
							* Add functions for Levenshtein distance alignment  
						
						
						
					 
					
						2015-05-24 21:50:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							744f06abf5 
							
						 
					 
					
						
						
							
							* Add script to read OntoNotes source documents  
						
						
						
					 
					
						2015-05-24 21:49:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fc75210941 
							
						 
					 
					
						
						
							
							* Move spacy.syntax.conll to spacy.gold  
						
						
						
					 
					
						2015-05-24 21:35:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							765b61cac4 
							
						 
					 
					
						
						
							
							* Update spacy.scorer, to use P/R/F to support tokenization errors  
						
						
						
					 
					
						2015-05-24 20:07:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							efe7a7d7d6 
							
						 
					 
					
						
						
							
							* Clean unused functions from spacy.syntax.conll  
						
						
						
					 
					
						2015-05-24 20:06:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							78487f3e66 
							
						 
					 
					
						
						
							
							* Update parser oracle for missing heads  
						
						
						
					 
					
						2015-05-24 20:05:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1044a13413 
							
						 
					 
					
						
						
							
							* Begin refactoring scorer to use recall over gold dependencies  
						
						
						
					 
					
						2015-05-24 17:40:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							acd1245ad4 
							
						 
					 
					
						
						
							
							* Remove cruft from conll.pyx --- unused stuff about evlauation, which now lives in spacy.scorer  
						
						
						
					 
					
						2015-05-24 17:35:49 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							20f1d868a3 
							
						 
					 
					
						
						
							
							* Tmp commit. Working on whole document parsing  
						
						
						
					 
					
						2015-05-24 02:49:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f2ee9c4feb 
							
						 
					 
					
						
						
							
							* Comment out constituency parsing stuff, so that code compiles  
						
						
						
					 
					
						2015-05-20 16:55:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8ee7c541f1 
							
						 
					 
					
						
						
							
							* Update Constituent definition  
						
						
						
					 
					
						2015-05-20 16:03:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9dfc9c039c 
							
						 
					 
					
						
						
							
							* Work on constituency parsing.  
						
						
						
					 
					
						2015-05-20 16:02:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5a5710e711 
							
						 
					 
					
						
						
							
							* Fix Span.subtree property  
						
						
						
					 
					
						2015-05-13 21:53:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							badf030b6c 
							
						 
					 
					
						
						
							
							* Add parse navigation to Span objects  
						
						
						
					 
					
						2015-05-13 21:45:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ca320afe86 
							
						 
					 
					
						
						
							
							* Add docstring for ents attribute  
						
						
						
					 
					
						2015-05-13 21:20:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ba07b925a7 
							
						 
					 
					
						
						
							
							* Fix compile error in conll.pyx  
						
						
						
					 
					
						2015-05-12 22:33:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f1e0272b18 
							
						 
					 
					
						
						
							
							* Disable c-parsing transitions  
						
						
						
					 
					
						2015-05-12 22:33:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							03a6626545 
							
						 
					 
					
						
						
							
							* Tmp commit  
						
						
						
					 
					
						2015-05-12 20:27:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9568ebed08 
							
						 
					 
					
						
						
							
							* Fix off-by-one in head reading  
						
						
						
					 
					
						2015-05-12 20:27:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							69840d8cc3 
							
						 
					 
					
						
						
							
							* Tweak verbose output printing in scorer.py  
						
						
						
					 
					
						2015-05-12 20:27:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0605af6838 
							
						 
					 
					
						
						
							
							* Fix head misalignment in read_conll, when periods are ignored  
						
						
						
					 
					
						2015-05-12 20:27:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d2ac8d8007 
							
						 
					 
					
						
						
							
							* Add ctnt field to State, in preparation for constituency parsing  
						
						
						
					 
					
						2015-05-12 20:27:56 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ab67693393 
							
						 
					 
					
						
						
							
							* Add read_json_file to conll.pyx  
						
						
						
					 
					
						2015-05-12 20:27:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aff9359a8d 
							
						 
					 
					
						
						
							
							* Update ner.pyx to expect brackets from gold_tuples  
						
						
						
					 
					
						2015-05-12 20:27:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0ad72a77ce 
							
						 
					 
					
						
						
							
							* Write JSON files, with both dependency and PSG parses  
						
						
						
					 
					
						2015-05-12 20:27:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d48218f4b2 
							
						 
					 
					
						
						
							
							* Add left_edge and right_edge properties  
						
						
						
					 
					
						2015-05-12 20:27:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							53cf77e1c8 
							
						 
					 
					
						
						
							
							* Bug fix: when non-monotonically correct a dependency, make sure to delete the old one from the child list  
						
						
						
					 
					
						2015-05-12 20:26:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a4e2af54f9 
							
						 
					 
					
						
						
							
							* Add support for l/r edge to add_dep, and move inlined methods into _state.pyx where possible  
						
						
						
					 
					
						2015-05-12 20:26:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d634038eb6 
							
						 
					 
					
						
						
							
							* Add l_edge and r_edge props in TokenC for tracking the parse-yield of the token  
						
						
						
					 
					
						2015-05-12 20:26:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							03ebf70a66 
							
						 
					 
					
						
						
							
							* Inc version to 0.84  
						
						
						
					 
					
						2015-05-12 02:38:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e73eaf2d05 
							
						 
					 
					
						
						
							
							* Replace some assertions with proper errors  
						
						
						
					 
					
						2015-05-08 16:52:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fb8d50b3d5 
							
						 
					 
					
						
						
							
							Merge branch 'master' of ssh://github.com/honnibal/spaCy  
						
						
						
					 
					
						2015-04-30 12:45:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ed8e8c3bd0 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-04-29 14:22:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							378c2a6435 
							
						 
					 
					
						
						
							
							* Fix POS model: make it use tag instead of pos in history features  
						
						
						
					 
					
						2015-04-29 00:02:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							763ef01575 
							
						 
					 
					
						
						
							
							* Fix two bugs in feature calculation  
						
						
						
					 
					
						2015-04-28 23:25:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b3fd48c97b 
							
						 
					 
					
						
						
							
							* Fix missing root labels bug identified in Issue  #57  
						
						
						
					 
					
						2015-04-28 20:45:51 +02:00 
						 
				 
			
				
					
						
							
							
								Jordan Suchow 
							
						 
					 
					
						
						
						
						
							
						
						
							3a8d9b37a6 
							
						 
					 
					
						
						
							
							Remove trailing whitespace  
						
						
						
					 
					
						2015-04-19 13:01:38 -07:00 
						 
				 
			
				
					
						
							
							
								Jordan Suchow 
							
						 
					 
					
						
						
						
						
							
						
						
							5f0f940a1f 
							
						 
					 
					
						
						
							
							Remove unused imports  
						
						
						
					 
					
						2015-04-19 01:05:22 -07:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cc4e395927 
							
						 
					 
					
						
						
							
							* Add some ad hoc regexes, for multi-word location prepositions  
						
						
						
					 
					
						2015-04-17 04:44:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f7ffd94e6a 
							
						 
					 
					
						
						
							
							* Add Token.conjuncts property  
						
						
						
					 
					
						2015-04-17 01:40:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							684d0e5e85 
							
						 
					 
					
						
						
							
							* Download updated data  
						
						
						
					 
					
						2015-04-16 04:29:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2ef170a991 
							
						 
					 
					
						
						
							
							* Fix Issue  #54 : Error merging multi-word token when there's a mid-token match.  
						
						
						
					 
					
						2015-04-16 04:28:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							42617548af 
							
						 
					 
					
						
						
							
							* Disable merge_mwes by default  
						
						
						
					 
					
						2015-04-16 04:20:31 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							99dbf8a38c 
							
						 
					 
					
						
						
							
							* Fix error type in lookup_transition  
						
						
						
					 
					
						2015-04-16 01:36:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							77d0700caf 
							
						 
					 
					
						
						
							
							* Add on X way regexes  
						
						
						
					 
					
						2015-04-16 01:35:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9f16848b60 
							
						 
					 
					
						
						
							
							* Add (N0w, N1w) unigram pair to NER features, prompted by failure to detect 'this weekend'  
						
						
						
					 
					
						2015-04-15 06:01:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c6707778dd 
							
						 
					 
					
						
						
							
							* Fix Issue  #51 : Handle non-ascii lemmas correctly  
						
						
						
					 
					
						2015-04-13 22:28:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bf0aff5124 
							
						 
					 
					
						
						
							
							* Fix bug in Tokens.ents where entity wasn't being emitted if another started immediately after  
						
						
						
					 
					
						2015-04-13 21:34:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2b84a90bbb 
							
						 
					 
					
						
						
							
							* Fix Issue  #50 : Python 3 compatibility of v0.80  
						
						
						
					 
					
						2015-04-13 05:59:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fbd48c571d 
							
						 
					 
					
						
						
							
							* Rearrange code in tokens.pyx  
						
						
						
					 
					
						2015-04-13 05:41:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							507048dc45 
							
						 
					 
					
						
						
							
							* Rename StandardError to Exception, for Python 3 compatibility  
						
						
						
					 
					
						2015-04-12 07:28:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							761a19113a 
							
						 
					 
					
						
						
							
							* Fix /tmp moving thing in download.py  
						
						
						
					 
					
						2015-04-12 07:04:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							248a2b4b0f 
							
						 
					 
					
						
						
							
							* Remove Spans class  
						
						
						
					 
					
						2015-04-12 04:07:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1d05e6da00 
							
						 
					 
					
						
						
							
							* Add ne_iob and ne_type features to NER  
						
						
						
					 
					
						2015-04-10 19:07:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4df8a3d90f 
							
						 
					 
					
						
						
							
							* Add ne_iob and ne_type attributes to context vector  
						
						
						
					 
					
						2015-04-10 05:02:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8c354c432b 
							
						 
					 
					
						
						
							
							* Add ValueError condition to ner_tag reading  
						
						
						
					 
					
						2015-04-10 04:59:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							435cccf098 
							
						 
					 
					
						
						
							
							* Add read_conll03_file function to conll.pyx  
						
						
						
					 
					
						2015-04-10 04:59:11 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							99c9ecfc18 
							
						 
					 
					
						
						
							
							* Fix bug in prefix, suffix and word shape features in parser and NER  
						
						
						
					 
					
						2015-04-10 03:53:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cff2b13fef 
							
						 
					 
					
						
						
							
							* Fix Issue  #44 : Broken Token.string attribute when single word sentence  
						
						
						
					 
					
						2015-04-07 06:08:25 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6640386b25 
							
						 
					 
					
						
						
							
							* Fix Issue  #43 : TAG attr not supported. Also add DEP attr, while I'm at it. Need better way of ensuring future changes don't break in similar way.  
						
						
						
					 
					
						2015-04-07 06:00:57 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b64b2bd910 
							
						 
					 
					
						
						
							
							* Fix Issue  #43 : TAG attr not supported. Also add DEP attr, while I'm at it. Need better way of ensuring future changes don't break in similar way.  
						
						
						
					 
					
						2015-04-07 06:00:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f9e510a893 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-04-07 04:53:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							66c7ccf6cc 
							
						 
					 
					
						
						
							
							* Fix Spans.orth_  
						
						
						
					 
					
						2015-04-07 04:53:40 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b8d34531c4 
							
						 
					 
					
						
						
							
							* Add support for units to English.__init__, by loading and applying regular expressions  
						
						
						
					 
					
						2015-04-07 04:02:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0ea5af88b6 
							
						 
					 
					
						
						
							
							* Add multi-word expression RegexMatcher  
						
						
						
					 
					
						2015-04-07 03:45:40 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2fee67cfa3 
							
						 
					 
					
						
						
							
							* Add regular expressions for English multi-word expressions  
						
						
						
					 
					
						2015-04-07 03:45:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5a075ea3fc 
							
						 
					 
					
						
						
							
							* Ensure NER moves are available for single-word tokens  
						
						
						
					 
					
						2015-04-05 22:30:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a60a366b2c 
							
						 
					 
					
						
						
							
							* Support 'punct' dep label in conll.pyx  
						
						
						
					 
					
						2015-04-05 22:30:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							021c972137 
							
						 
					 
					
						
						
							
							* Print parse if verbose in scorer  
						
						
						
					 
					
						2015-04-05 22:29:30 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fbf19049cf 
							
						 
					 
					
						
						
							
							* Add ent_type_ property  
						
						
						
					 
					
						2015-03-31 02:01:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e70b87efeb 
							
						 
					 
					
						
						
							
							* Add merge() method to Tokens, with fairly brittle/hacky implementation, but quite easy to test. Passing minimal tests. Still need to fix left/right deps in C data  
						
						
						
					 
					
						2015-03-30 01:37:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							557856e84c 
							
						 
					 
					
						
						
							
							* Allow regular expressions to specify labels for merged spans  
						
						
						
					 
					
						2015-03-27 17:40:52 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a3af6b7c3d 
							
						 
					 
					
						
						
							
							* Left-Arc from Root, to allow non-monotonic reduce to compete with left-arc when the stack is not empty.  
						
						
						
					 
					
						2015-03-27 17:39:16 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							db5a43318c 
							
						 
					 
					
						
						
							
							* Improve print_state debug printer  
						
						
						
					 
					
						2015-03-27 17:29:58 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1705eccbbe 
							
						 
					 
					
						
						
							
							* Remove whitespace  
						
						
						
					 
					
						2015-03-27 15:22:39 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3feb52374c 
							
						 
					 
					
						
						
							
							* Break apart a condition, for ease of debug printing  
						
						
						
					 
					
						2015-03-27 15:21:38 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b32f581acb 
							
						 
					 
					
						
						
							
							* Fix bug in ArcEager.get_labels  
						
						
						
					 
					
						2015-03-27 15:21:06 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5f2a4ff36d 
							
						 
					 
					
						
						
							
							* Fix spans.lemma_  
						
						
						
					 
					
						2015-03-26 16:45:38 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f4cc222ec3 
							
						 
					 
					
						
						
							
							* Fix NER scoring  
						
						
						
					 
					
						2015-03-26 16:45:38 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1320bd19db 
							
						 
					 
					
						
						
							
							* Move Span class to own file  
						
						
						
					 
					
						2015-03-26 16:45:38 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6f47a667cf 
							
						 
					 
					
						
						
							
							* Move Span class to own file  
						
						
						
					 
					
						2015-03-26 16:45:38 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f02c39dfaf 
							
						 
					 
					
						
						
							
							* Compare to is not None, for more robustness  
						
						
						
					 
					
						2015-03-26 16:44:48 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8f68b864c4 
							
						 
					 
					
						
						
							
							* Move Span/Spans to separate files. Currently duplicates lots of Tokens functionality. Should probably be integrated into Tokens  
						
						
						
					 
					
						2015-03-26 16:44:48 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e854ba0a13 
							
						 
					 
					
						
						
							
							* Remove support for force_gold flag from GreedyParser, since it's not so useful, and it's clutter  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6a6085f8b9 
							
						 
					 
					
						
						
							
							* Clean up GreedyParser.train function a bit  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b3157927e6 
							
						 
					 
					
						
						
							
							* Clean up unused feature templates  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							411bf377d4 
							
						 
					 
					
						
						
							
							* Remove dependency on ner_util module  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							01c892f583 
							
						 
					 
					
						
						
							
							* Add comment to fill_context  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2741179aff 
							
						 
					 
					
						
						
							
							* Important bug fix: Fill token N2w, which was being unfilled, after a bad edit while writing the NER features.  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2b2dec95d3 
							
						 
					 
					
						
						
							
							* Add comment to set_parse  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e770fade1e 
							
						 
					 
					
						
						
							
							* Don't set dependency labels in set_parse, as this may be used by the Entity recogniser instead. Need to clean this method up...  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							71648205d9 
							
						 
					 
					
						
						
							
							* Add support for debug feature set. Just use unigrams for this.  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3b70b304b2 
							
						 
					 
					
						
						
							
							* Add words to gold_tuples from gold conll file  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2e12dec76e 
							
						 
					 
					
						
						
							
							* Adjust scorer to account for tokenization mistakes  
						
						
						
					 
					
						2015-03-26 16:44:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							05d6065e2e 
							
						 
					 
					
						
						
							
							* Add assertion  
						
						
						
					 
					
						2015-03-26 16:44:46 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							377e9b29b1 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2015-03-26 16:44:46 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							670959f40c 
							
						 
					 
					
						
						
							
							* Fix iteration order on Tokens.rights  
						
						
						
					 
					
						2015-03-26 16:44:46 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							231ce2dae5 
							
						 
					 
					
						
						
							
							* Assign ROOT label by default. May be papering over another bug.  
						
						
						
					 
					
						2015-03-26 16:44:46 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9f4ad8fdfb 
							
						 
					 
					
						
						
							
							* Assign root words the ROOT label via the Break transition. Something is still wrong here...  
						
						
						
					 
					
						2015-03-26 16:44:46 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f729164c01 
							
						 
					 
					
						
						
							
							* Fix bug in label assignment: ensure null-label transitions receive the label 0  
						
						
						
					 
					
						2015-03-26 16:44:46 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7237c805c7 
							
						 
					 
					
						
						
							
							* Load tag for specials.json token  
						
						
						
					 
					
						2015-03-26 16:44:46 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							567388e38d 
							
						 
					 
					
						
						
							
							* Use values encoded by StringStore in POS tagging, rather than indices into a list of tags  
						
						
						
					 
					
						2015-03-26 16:44:45 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3105c7f8ba 
							
						 
					 
					
						
						
							
							* Don't pass label_ids dict to Tokens, since we now use the StringStore to manage string-to-int mapping for labels  
						
						
						
					 
					
						2015-03-26 16:44:45 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							801bf14f4f 
							
						 
					 
					
						
						
							
							* Clean up handling of dep_strings and ent_strings, using StringStore to encode the label names.  
						
						
						
					 
					
						2015-03-26 16:44:45 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							31fad99518 
							
						 
					 
					
						
						
							
							* Use StringStore to encode label names, instead of label_ids  
						
						
						
					 
					
						2015-03-26 16:44:45 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							64db61bff1 
							
						 
					 
					
						
						
							
							* Add Span class to Python API  
						
						
						
					 
					
						2015-03-26 16:44:45 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b9b695fb1b 
							
						 
					 
					
						
						
							
							* Remove debug word list  
						
						
						
					 
					
						2015-03-26 16:44:45 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f21ab2d7fb 
							
						 
					 
					
						
						
							
							* Fix bug in ugly ent_strings hack on English class  
						
						
						
					 
					
						2015-03-26 16:44:45 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1c843934be 
							
						 
					 
					
						
						
							
							* Fix oracle bug in NER. Now getting 77% F on ontonotes  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							903f196b3f 
							
						 
					 
					
						
						
							
							* Fix verbose printing for scorer  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e181c051d5 
							
						 
					 
					
						
						
							
							* Improve features for NER  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7ecb52c0ed 
							
						 
					 
					
						
						
							
							* Add scorer script  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8057a95f20 
							
						 
					 
					
						
						
							
							* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring.  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ae235e07b9 
							
						 
					 
					
						
						
							
							* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc.  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b3eda03c9c 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							220ce8bfed 
							
						 
					 
					
						
						
							
							* Prepare English class for NER  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f5830dc1c1 
							
						 
					 
					
						
						
							
							* Remove _transitions.pyx  
						
						
						
					 
					
						2015-03-26 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6865c2fb4d 
							
						 
					 
					
						
						
							
							* Fix assignment of dep strings in tokens.pyx  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6b6bce9e7a 
							
						 
					 
					
						
						
							
							* Fix label loading for transition system  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5278c7504b 
							
						 
					 
					
						
						
							
							* Hacks to conll.pyx. Should clean these up.  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f321b2b2eb 
							
						 
					 
					
						
						
							
							* Remove TODO comment  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fdabd93bfb 
							
						 
					 
					
						
						
							
							* Ensure high loss for invalid moves, and fix label reading for arc-eager  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							10ed738df2 
							
						 
					 
					
						
						
							
							* Tmp commit  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4f83c9b3d5 
							
						 
					 
					
						
						
							
							* Make costs label-sensitive  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							179b7eb0a7 
							
						 
					 
					
						
						
							
							* Specify parser transition system in language  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8c883cef58 
							
						 
					 
					
						
						
							
							* Refactored transition system code now compiling. Still need to hook up label oracle, and test  
						
						
						
					 
					
						2015-03-26 16:44:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f0159ab4b6 
							
						 
					 
					
						
						
							
							* Add file to hold GoldParse class  
						
						
						
					 
					
						2015-03-26 16:44:42 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8eadb984cb 
							
						 
					 
					
						
						
							
							* Refactor arc_eager to use new TransitionSystem base class. Need to fix oracle  
						
						
						
					 
					
						2015-03-26 16:44:42 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b063001596 
							
						 
					 
					
						
						
							
							* Add base TransitionSystem class. Still need to rethink how non-monotonic labelling will work for best_valid  
						
						
						
					 
					
						2015-03-26 16:44:42 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							01bc4d6815 
							
						 
					 
					
						
						
							
							* Add set_parse method, to assign parse to tokens in a less hacky way.  
						
						
						
					 
					
						2015-03-26 16:44:42 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dc986dbc0b 
							
						 
					 
					
						
						
							
							* Work on refactored parser, where TransitionSystem can be easily subclassed  
						
						
						
					 
					
						2015-03-26 16:44:42 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1cc6329b18 
							
						 
					 
					
						
						
							
							* Add base class to do transitions  
						
						
						
					 
					
						2015-03-26 16:44:42 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							135756ac3d 
							
						 
					 
					
						
						
							
							* Tmp commit of NER refactoring  
						
						
						
					 
					
						2015-03-26 16:44:42 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							23c1f6fc04 
							
						 
					 
					
						
						
							
							* Merge changes from stash  
						
						
						
					 
					
						2015-03-26 16:44:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0ff078876a 
							
						 
					 
					
						
						
							
							* Commit some work on ner.yx done on the plane  
						
						
						
					 
					
						2015-03-26 16:44:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d81b7be6a2 
							
						 
					 
					
						
						
							
							* Merge train.py  
						
						
						
					 
					
						2015-03-26 16:44:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2e3dc3dfe2 
							
						 
					 
					
						
						
							
							* Merge changes in tokens.pyx  
						
						
						
					 
					
						2015-03-26 16:44:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8cc3524dc9 
							
						 
					 
					
						
						
							
							* Ws  
						
						
						
					 
					
						2015-03-26 16:44:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d0570685c 
							
						 
					 
					
						
						
							
							* Add NER transition system  
						
						
						
					 
					
						2015-03-26 16:44:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							043b758cf4 
							
						 
					 
					
						
						
							
							* Resurrect old NER code. This version won't be the one that runs; we want to re-use the parser code. But for now this is a useful reference.  
						
						
						
					 
					
						2015-03-26 16:44:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b139aa92ba 
							
						 
					 
					
						
						
							
							* Start setting out how NER will be implemented in the data model  
						
						
						
					 
					
						2015-03-26 16:44:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0962ffc095 
							
						 
					 
					
						
						
							
							* Fix issue  #37 : missing check_flag attribute from Token class  
						
						
						
					 
					
						2015-03-26 15:06:26 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2e8d0e5d45 
							
						 
					 
					
						
						
							
							* Upd download script  
						
						
						
					 
					
						2015-03-03 05:47:16 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dbe26f5793 
							
						 
					 
					
						
						
							
							* Add children and subtree methods to Token, which are generators to assist parse-tree navigation.  
						
						
						
					 
					
						2015-03-03 04:18:41 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ea90d136e8 
							
						 
					 
					
						
						
							
							* Fix bug in labelled parsing, that caused an 8% drop in labelled accuracy.  
						
						
						
					 
					
						2015-02-27 03:56:10 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							caf046b220 
							
						 
					 
					
						
						
							
							* Hastily add method to apply tags from a list of strings, instead of predicting the tags.  
						
						
						
					 
					
						2015-02-23 15:40:17 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cae077b583 
							
						 
					 
					
						
						
							
							* Work on fixing orphaned Token objects bug  
						
						
						
					 
					
						2015-02-16 15:20:31 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7572e31f5e 
							
						 
					 
					
						
						
							
							* Pass ownership of C data to Token instances if Tokens object is being garbage-collected, but Token instances are staying alive.  
						
						
						
					 
					
						2015-02-11 18:05:06 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							64645a1c2f 
							
						 
					 
					
						
						
							
							* Improve docstring on English  
						
						
						
					 
					
						2015-02-11 15:13:20 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							594e50bd45 
							
						 
					 
					
						
						
							
							* Add option to download speech-parsing data set.  
						
						
						
					 
					
						2015-02-11 14:20:29 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0b7e769211 
							
						 
					 
					
						
						
							
							* Add POS tags to support SWBD tag set  
						
						
						
					 
					
						2015-02-11 14:08:28 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							312b3a45f3 
							
						 
					 
					
						
						
							
							* Fix issue  #19 : Allow parsing/pos tagging of empty strings  
						
						
						
					 
					
						2015-02-10 10:15:58 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2a0615104b 
							
						 
					 
					
						
						
							
							* Upd download script  
						
						
						
					 
					
						2015-02-09 10:22:59 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5c3513583d 
							
						 
					 
					
						
						
							
							* Clear buffered python tokens when modifying the Tokens object. Need to clean this up, and modify via a method on Tokens.  
						
						
						
					 
					
						2015-02-09 03:57:10 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							be5536d239 
							
						 
					 
					
						
						
							
							* Fix Issue  #22 : PRP and PRP$ were mapped to NOUN. Should be PRON.  
						
						
						
					 
					
						2015-02-08 18:36:18 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0492cee8b4 
							
						 
					 
					
						
						
							
							* Fix Issue  #24 : Lemmas are empty when the L field is missing for special-cased tokens  
						
						
						
					 
					
						2015-02-08 18:30:30 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d229fbd228 
							
						 
					 
					
						
						
							
							* Give better error on out-of-bounds array access  
						
						
						
					 
					
						2015-02-07 12:59:12 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ab8bb047d0 
							
						 
					 
					
						
						
							
							* Fix negative index for __getitem__  
						
						
						
					 
					
						2015-02-07 12:58:46 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							44c7eafe44 
							
						 
					 
					
						
						
							
							* Fix download.py  
						
						
						
					 
					
						2015-02-07 12:00:36 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6ca7f2eedc 
							
						 
					 
					
						
						
							
							* Upd download script  
						
						
						
					 
					
						2015-02-07 11:32:33 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f0e0588833 
							
						 
					 
					
						
						
							
							* Fill L2 norm attribute on LexemeC struct  
						
						
						
					 
					
						2015-02-07 08:44:42 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							75f9b7d6bf 
							
						 
					 
					
						
						
							
							* Add L2 norm field to LexemeC struct  
						
						
						
					 
					
						2015-02-07 08:43:17 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							51b618d646 
							
						 
					 
					
						
						
							
							* Add a has_repvec property to Lexeme, and a check function to check flags  
						
						
						
					 
					
						2015-02-07 08:42:44 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							321b402739 
							
						 
					 
					
						
						
							
							* Store the l2 norm of the word's vector  
						
						
						
					 
					
						2015-02-07 08:42:16 -05:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c7d8644149 
							
						 
					 
					
						
						
							
							* Fix regression on 'prob' attr of Token.  
						
						
						
					 
					
						2015-02-03 03:32:18 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c55a33d045 
							
						 
					 
					
						
						
							
							* Catch oracle errors  
						
						
						
					 
					
						2015-02-02 23:02:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							de772088e6 
							
						 
					 
					
						
						
							
							* Use parse tree for sbd in Tokens.sents  
						
						
						
					 
					
						2015-02-02 12:17:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							56c2ef2982 
							
						 
					 
					
						
						
							
							* Tweak POS features for web text  
						
						
						
					 
					
						2015-02-02 11:59:36 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d68678a93e 
							
						 
					 
					
						
						
							
							* Add Exception class, OracleError  
						
						
						
					 
					
						2015-02-02 11:57:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a20fdbd8ee 
							
						 
					 
					
						
						
							
							* Upd download script  
						
						
						
					 
					
						2015-02-01 13:22:23 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							76d9394cb4 
							
						 
					 
					
						
						
							
							* Fix vocab.pyx for Python3  
						
						
						
					 
					
						2015-02-01 13:14:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							63abdf154c 
							
						 
					 
					
						
						
							
							* Hastily hack download file  
						
						
						
					 
					
						2015-01-31 22:48:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7de00c5a79 
							
						 
					 
					
						
						
							
							* Try not holding a reference to Pool, since that seems to confuse the GC  
						
						
						
					 
					
						2015-01-31 22:10:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ce3ae8b5d9 
							
						 
					 
					
						
						
							
							* Fix platform-specific lexicon bug.  
						
						
						
					 
					
						2015-01-31 16:38:58 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a1ed574b7b 
							
						 
					 
					
						
						
							
							* Fix default model path for English  
						
						
						
					 
					
						2015-01-31 16:38:27 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							018e0bfa24 
							
						 
					 
					
						
						
							
							* Bug fixes to parse navigation  
						
						
						
					 
					
						2015-01-31 16:37:13 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e013555b25 
							
						 
					 
					
						
						
							
							* Add option to download script  
						
						
						
					 
					
						2015-01-31 13:51:56 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							08ca5c8970 
							
						 
					 
					
						
						
							
							* Add sent_end flag to TokenC struct  
						
						
						
					 
					
						2015-01-31 13:44:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							024cfd485c 
							
						 
					 
					
						
						
							
							* Pass tag_strings as a tuple, to support new Tokens API  
						
						
						
					 
					
						2015-01-31 13:43:37 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							77d62d0179 
							
						 
					 
					
						
						
							
							* Large refactor of Token objects, making them much thinner. This is to support fast parse-tree navigation.  
						
						
						
					 
					
						2015-01-31 13:42:58 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							88170e6295 
							
						 
					 
					
						
						
							
							* Supply dep_strings as a tuple, for the changed API on Tokens  
						
						
						
					 
					
						2015-01-31 13:42:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0981d68022 
							
						 
					 
					
						
						
							
							* Set a sent_end flag during parsing, for later use  
						
						
						
					 
					
						2015-01-31 13:41:46 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							251dbf24d7 
							
						 
					 
					
						
						
							
							* Fix unintialised variable error  
						
						
						
					 
					
						2015-01-30 20:46:34 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							83a4df5a1a 
							
						 
					 
					
						
						
							
							* Fix download script  
						
						
						
					 
					
						2015-01-30 20:40:42 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6f9ebc2f34 
							
						 
					 
					
						
						
							
							* Fix download script  
						
						
						
					 
					
						2015-01-30 20:33:19 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8b85d0bb8a 
							
						 
					 
					
						
						
							
							* Only download small data if no data dir exists  
						
						
						
					 
					
						2015-01-30 20:27:14 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1a7a1c2771 
							
						 
					 
					
						
						
							
							* Fix Issue  #16 : tokens recurse when printing  
						
						
						
					 
					
						2015-01-30 19:47:50 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cb95ef6934 
							
						 
					 
					
						
						
							
							* Fix download script  
						
						
						
					 
					
						2015-01-30 19:28:43 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e578bd37bd 
							
						 
					 
					
						
						
							
							* Fix download script  
						
						
						
					 
					
						2015-01-30 18:59:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							df52014d12 
							
						 
					 
					
						
						
							
							* Fix download script  
						
						
						
					 
					
						2015-01-30 18:36:24 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0f95712189 
							
						 
					 
					
						
						
							
							* Improve accuracy reporting during training  
						
						
						
					 
					
						2015-01-30 18:05:06 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b68f563c2f 
							
						 
					 
					
						
						
							
							* Fix Issue  #14 : Improve parsing API  
						
						
						
					 
					
						2015-01-30 18:04:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							998b607f65 
							
						 
					 
					
						
						
							
							* Upd download script, having it download all data if there's no data/ directory, allowing easier compilation from source  
						
						
						
					 
					
						2015-01-30 18:04:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							67d6e53a69 
							
						 
					 
					
						
						
							
							* Ensure parser and tagger function correctly when training from missing values, indicated by -1  
						
						
						
					 
					
						2015-01-30 14:08:56 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4ff180db74 
							
						 
					 
					
						
						
							
							* Fix off-by-one error in commit  0a7fceb 
						
						
						
					 
					
						2015-01-30 12:49:33 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0a7fcebdf7 
							
						 
					 
					
						
						
							
							* Fix Issue  #12 : Incorrect token.idx calculations for some punctuation, in the presence of token cache  
						
						
						
					 
					
						2015-01-30 12:33:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ebf7d2fab1 
							
						 
					 
					
						
						
							
							* Use non-joint sbd, for more simplicity and fewer classes  
						
						
						
					 
					
						2015-01-29 06:22:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d05c5bf141 
							
						 
					 
					
						
						
							
							* Remove comment  
						
						
						
					 
					
						2015-01-29 05:19:27 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							320b045daa 
							
						 
					 
					
						
						
							
							* Oracle now consistent over gold standard derivation  
						
						
						
					 
					
						2015-01-29 03:41:58 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f590382134 
							
						 
					 
					
						
						
							
							* Work on sbd  
						
						
						
					 
					
						2015-01-29 03:18:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1884a7a0be 
							
						 
					 
					
						
						
							
							* Attach comment with paper  
						
						
						
					 
					
						2015-01-28 03:18:43 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a2d6b195db 
							
						 
					 
					
						
						
							
							* Add messy Break transitions, carefully following the scheme of Dd Zhang et al (2013)  
						
						
						
					 
					
						2015-01-28 03:09:45 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f9ee5d9934 
							
						 
					 
					
						
						
							
							* Build a python list of word strings, for debugging  
						
						
						
					 
					
						2015-01-28 01:06:13 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d819101571 
							
						 
					 
					
						
						
							
							* Improve error message on oracle failure  
						
						
						
					 
					
						2015-01-28 00:58:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e6c3d3471f 
							
						 
					 
					
						
						
							
							* Tweak documentation for Tokens, and hide constructor as __cinit__  
						
						
						
					 
					
						2015-01-27 18:57:52 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c38c62d4a3 
							
						 
					 
					
						
						
							
							* Add docstring to English class  
						
						
						
					 
					
						2015-01-27 02:45:21 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d4c99f7dec 
							
						 
					 
					
						
						
							
							* Add attrs.pxd  
						
						
						
					 
					
						2015-01-26 22:22:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d4a493855e 
							
						 
					 
					
						
						
							
							* Fix error msg  
						
						
						
					 
					
						2015-01-25 23:01:30 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7f87716cf7 
							
						 
					 
					
						
						
							
							* Fix download script  
						
						
						
					 
					
						2015-01-25 23:01:10 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							92fb9257dd 
							
						 
					 
					
						
						
							
							* Add parts-of-speech file  
						
						
						
					 
					
						2015-01-25 22:00:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c1c3dba4cb 
							
						 
					 
					
						
						
							
							* Check whether vector files are present before trying to load them.  
						
						
						
					 
					
						2015-01-25 18:16:48 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5049d4c2e6 
							
						 
					 
					
						
						
							
							* Add parts_of_speech.pyx  
						
						
						
					 
					
						2015-01-25 16:32:26 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							12b034e3ef 
							
						 
					 
					
						
						
							
							* Move POS tag definitions to parts_of_speech.pxd  
						
						
						
					 
					
						2015-01-25 16:31:07 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7431c133d8 
							
						 
					 
					
						
						
							
							* Add error if try to access head and not is_parsed  
						
						
						
					 
					
						2015-01-25 15:33:54 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							951d06c824 
							
						 
					 
					
						
						
							
							* Silently don't parse if data is not present  
						
						
						
					 
					
						2015-01-25 14:47:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4e857ab7a6 
							
						 
					 
					
						
						
							
							* Fix bug in POS tagger feature  
						
						
						
					 
					
						2015-01-25 02:20:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dd56e298e2 
							
						 
					 
					
						
						
							
							* Ensure tagging is applied if parse=True  
						
						
						
					 
					
						2015-01-25 02:19:44 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							94750819cd 
							
						 
					 
					
						
						
							
							* Set parse=True by default --- i.e. parse unless told not to.  
						
						
						
					 
					
						2015-01-25 01:28:28 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							71b95202eb 
							
						 
					 
					
						
						
							
							* Add docstring to StringStore  
						
						
						
					 
					
						2015-01-24 20:49:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6d1c08dafd 
							
						 
					 
					
						
						
							
							* Add docstring to Lexeme  
						
						
						
					 
					
						2015-01-24 20:48:34 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a97bed9359 
							
						 
					 
					
						
						
							
							* Fix POS and dependency label tag names.  Add parse and string navigation functions.  
						
						
						
					 
					
						2015-01-24 17:29:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							76cd024095 
							
						 
					 
					
						
						
							
							* Add whitespace property to Token  
						
						
						
					 
					
						2015-01-24 07:41:21 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5fd72bc220 
							
						 
					 
					
						
						
							
							* Have 'string' refer to the whitespace-padded string  
						
						
						
					 
					
						2015-01-24 07:32:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fda94271af 
							
						 
					 
					
						
						
							
							* Rename NORM1 and NORM2 attrs to lower and norm  
						
						
						
					 
					
						2015-01-24 06:17:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5ed8b2b98f 
							
						 
					 
					
						
						
							
							* Rename sic to orth  
						
						
						
					 
					
						2015-01-23 02:08:25 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a27b23cc8f 
							
						 
					 
					
						
						
							
							* Have SBD return start/end indices  
						
						
						
					 
					
						2015-01-22 22:24:44 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d460c28838 
							
						 
					 
					
						
						
							
							* Rename vec to repvec  
						
						
						
					 
					
						2015-01-22 02:06:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8b9d913d97 
							
						 
					 
					
						
						
							
							* Rename vec to repvec  
						
						
						
					 
					
						2015-01-22 02:05:58 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9cd0b6b3e9 
							
						 
					 
					
						
						
							
							* Various tweaks to Tokens class  
						
						
						
					 
					
						2015-01-22 02:05:37 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5928d158ce 
							
						 
					 
					
						
						
							
							* Pass the string to Tokens  
						
						
						
					 
					
						2015-01-22 02:04:58 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							45264e356b 
							
						 
					 
					
						
						
							
							* Rename vec to repvec  
						
						
						
					 
					
						2015-01-22 02:04:24 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5e63c606ad 
							
						 
					 
					
						
						
							
							* Rename vec to repvec  
						
						
						
					 
					
						2015-01-22 02:03:54 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							56e6cf0672 
							
						 
					 
					
						
						
							
							* Add _string attr to Tokens object  
						
						
						
					 
					
						2015-01-21 18:57:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d6ac60e91c 
							
						 
					 
					
						
						
							
							* Bug fixes to sentences method, and improved vector transport for tokens  
						
						
						
					 
					
						2015-01-21 18:56:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f2a229136c 
							
						 
					 
					
						
						
							
							* Fix data_dir=None argument to English class  
						
						
						
					 
					
						2015-01-21 18:27:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ef49b8c179 
							
						 
					 
					
						
						
							
							* Add stop-word flag  
						
						
						
					 
					
						2015-01-21 18:22:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6646bfc5df 
							
						 
					 
					
						
						
							
							* Add LOWER attr  
						
						
						
					 
					
						2015-01-21 18:19:08 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f149259bf5 
							
						 
					 
					
						
						
							
							* Fix negative indices in tokens  
						
						
						
					 
					
						2015-01-20 01:16:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b65b0c07bf 
							
						 
					 
					
						
						
							
							* Messily hook up vector in tokens  
						
						
						
					 
					
						2015-01-19 19:59:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8ff5b8bd84 
							
						 
					 
					
						
						
							
							* Add attribute for POS scheme  
						
						
						
					 
					
						2015-01-17 17:33:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6c7e44140b 
							
						 
					 
					
						
						
							
							* Work on word vectors, and other stuff  
						
						
						
					 
					
						2015-01-17 16:21:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							802867e96a 
							
						 
					 
					
						
						
							
							* Revise interface to Token. Strings now have attribute names like norm1_  
						
						
						
					 
					
						2015-01-15 03:51:47 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7d3c40de7d 
							
						 
					 
					
						
						
							
							* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme  
						
						
						
					 
					
						2015-01-15 00:33:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0930892fc1 
							
						 
					 
					
						
						
							
							* Tmp. Working on refactor. Compiles, must hook up lexical feats.  
						
						
						
					 
					
						2015-01-14 00:03:48 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							46da3d74d2 
							
						 
					 
					
						
						
							
							* Tmp. Refactoring, introducing a Lexeme PyObject.  
						
						
						
					 
					
						2015-01-12 11:23:44 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ce2edd6312 
							
						 
					 
					
						
						
							
							* Tmp commit. Refactoring to create a Python Lexeme class.  
						
						
						
					 
					
						2015-01-12 10:26:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aacaf1a0f0 
							
						 
					 
					
						
						
							
							* Fix parser  
						
						
						
					 
					
						2015-01-08 01:19:23 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9a21127bf7 
							
						 
					 
					
						
						
							
							* Fix parser, which was importing the wrong model  
						
						
						
					 
					
						2015-01-08 00:10:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6a3e39cdd1 
							
						 
					 
					
						
						
							
							* Add typedefs.pyx  
						
						
						
					 
					
						2015-01-06 04:51:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a58920cc5e 
							
						 
					 
					
						
						
							
							* Import orth.word_shape as a C module  
						
						
						
					 
					
						2015-01-06 03:18:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6b68f7ef75 
							
						 
					 
					
						
						
							
							* Finally get string types right for orth function  
						
						
						
					 
					
						2015-01-06 03:17:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							90c143bd85 
							
						 
					 
					
						
						
							
							* Fix orth import  
						
						
						
					 
					
						2015-01-05 18:49:19 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7689dccd0f 
							
						 
					 
					
						
						
							
							* Remove unused import  
						
						
						
					 
					
						2015-01-05 18:48:48 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3f1944d688 
							
						 
					 
					
						
						
							
							* Make PyPy work  
						
						
						
					 
					
						2015-01-05 17:54:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a510d9f677 
							
						 
					 
					
						
						
							
							* Another assertion removed  
						
						
						
					 
					
						2015-01-05 13:01:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2856946a66 
							
						 
					 
					
						
						
							
							* Remove assertion that doesn't work on Python 3  
						
						
						
					 
					
						2015-01-05 12:51:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							94034f1112 
							
						 
					 
					
						
						
							
							* Fix encoding in lemmatization  
						
						
						
					 
					
						2015-01-05 11:54:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b132b3caa6 
							
						 
					 
					
						
						
							
							* Fix unicode error in lemmatizer  
						
						
						
					 
					
						2015-01-05 11:53:54 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							477e7fbffe 
							
						 
					 
					
						
						
							
							* Fix data reading for lemmatizer  
						
						
						
					 
					
						2015-01-05 06:01:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							58f75abaca 
							
						 
					 
					
						
						
							
							* Fix unicode error in orth  
						
						
						
					 
					
						2015-01-05 05:53:08 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4e085d5166 
							
						 
					 
					
						
						
							
							* Fix lemmatizer for Python3  
						
						
						
					 
					
						2015-01-05 05:51:26 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ae7c811fd1 
							
						 
					 
					
						
						
							
							* Use Exception instead of StandardError  
						
						
						
					 
					
						2015-01-04 01:22:12 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0e4c2ba036 
							
						 
					 
					
						
						
							
							* Fix loading of special morph words  
						
						
						
					 
					
						2015-01-03 23:13:00 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f5d41028b5 
							
						 
					 
					
						
						
							
							* Move around data files for test release  
						
						
						
					 
					
						2015-01-03 01:59:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a24321b63a 
							
						 
					 
					
						
						
							
							* Add downloader  
						
						
						
					 
					
						2015-01-02 21:44:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5d9a096e2f 
							
						 
					 
					
						
						
							
							* Some minor clean-up after HastyModel  
						
						
						
					 
					
						2014-12-31 19:46:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aafaf58cbe 
							
						 
					 
					
						
						
							
							* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile.  
						
						
						
					 
					
						2014-12-31 19:40:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bcd038e7b6 
							
						 
					 
					
						
						
							
							* Implement HastyModel  
						
						
						
					 
					
						2014-12-31 01:16:47 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1a075f77ff 
							
						 
					 
					
						
						
							
							* Don't over-ride pre-loaded POS tags, if set by special-cases  
						
						
						
					 
					
						2014-12-30 23:26:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							785c7ba76a 
							
						 
					 
					
						
						
							
							* Embed signature on attrs  
						
						
						
					 
					
						2014-12-30 23:25:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							30e5805656 
							
						 
					 
					
						
						
							
							* Lazy-load tagger and parser  
						
						
						
					 
					
						2014-12-30 23:25:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9976aa976e 
							
						 
					 
					
						
						
							
							* Messily fix morphology and POS tags on special tokens.  
						
						
						
					 
					
						2014-12-30 23:24:37 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c1ef3febee 
							
						 
					 
					
						
						
							
							* Embedsignature in tokens.pyx  
						
						
						
					 
					
						2014-12-30 21:22:00 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aac5028b6e 
							
						 
					 
					
						
						
							
							* Move tagger to _ml  
						
						
						
					 
					
						2014-12-30 21:21:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1ffb0229ed 
							
						 
					 
					
						
						
							
							* Import tokens in parser.pxd  
						
						
						
					 
					
						2014-12-30 21:21:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bb0b00f819 
							
						 
					 
					
						
						
							
							* Repurporse the Tagger class as a generic Model, wrapping thinc's interface  
						
						
						
					 
					
						2014-12-30 21:20:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fe2a5e0370 
							
						 
					 
					
						
						
							
							* Work on docstrings  
						
						
						
					 
					
						2014-12-27 21:46:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bb80937544 
							
						 
					 
					
						
						
							
							* Upd docstrings  
						
						
						
					 
					
						2014-12-27 18:45:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b8b65903fc 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2014-12-24 17:42:00 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ab61673edd 
							
						 
					 
					
						
						
							
							* Fix api of array method  
						
						
						
					 
					
						2014-12-23 15:18:48 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7708d0e24a 
							
						 
					 
					
						
						
							
							* Move lemmatizer to en dir  
						
						
						
					 
					
						2014-12-23 15:16:57 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							98eb4c0426 
							
						 
					 
					
						
						
							
							* Fix path to parser model  
						
						
						
					 
					
						2014-12-23 15:09:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b00bc01d8c 
							
						 
					 
					
						
						
							
							* All tests now passing for reorg  
						
						
						
					 
					
						2014-12-23 13:18:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							73f200436f 
							
						 
					 
					
						
						
							
							* Tests passing except for morphology/lemmatization stuff  
						
						
						
					 
					
						2014-12-23 11:40:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cf8d26c3d2 
							
						 
					 
					
						
						
							
							* POS tagger training working after reorg  
						
						
						
					 
					
						2014-12-22 08:54:47 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4c4aa2c5c9 
							
						 
					 
					
						
						
							
							* Work on train  
						
						
						
					 
					
						2014-12-22 07:25:43 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							61df50b598 
							
						 
					 
					
						
						
							
							* Add English-subclass POS tagger  
						
						
						
					 
					
						2014-12-21 20:59:07 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9f3f07cab6 
							
						 
					 
					
						
						
							
							* Add attrs file for English  
						
						
						
					 
					
						2014-12-21 11:29:11 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2a89d70429 
							
						 
					 
					
						
						
							
							* Add vocab.pyx to setup, and ensure we can import spacy.en.lang  
						
						
						
					 
					
						2014-12-21 06:03:53 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b34a1325d3 
							
						 
					 
					
						
						
							
							* Everything compiling after reorg. About to start testing.  
						
						
						
					 
					
						2014-12-21 05:42:23 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e1c1a4b868 
							
						 
					 
					
						
						
							
							* Tmp  
						
						
						
					 
					
						2014-12-21 05:36:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d11c1edf8c 
							
						 
					 
					
						
						
							
							* Import slice_unicode from strings.pyx  
						
						
						
					 
					
						2014-12-20 07:56:26 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							be1bdcbd85 
							
						 
					 
					
						
						
							
							* Move lang.pyx to tokenizer.pyx  
						
						
						
					 
					
						2014-12-20 07:55:40 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							89a1cc1a48 
							
						 
					 
					
						
						
							
							* Move murmurhash to .pxd in strings file  
						
						
						
					 
					
						2014-12-20 07:41:08 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d5a942c4a4 
							
						 
					 
					
						
						
							
							* Rename lang.pyx to tokenizer.pyx  
						
						
						
					 
					
						2014-12-20 07:30:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a60ae261ae 
							
						 
					 
					
						
						
							
							* Move tokenizer to its own file, and refactor  
						
						
						
					 
					
						2014-12-20 07:29:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							867a4a000c 
							
						 
					 
					
						
						
							
							* Export set_morph_from_dict function  
						
						
						
					 
					
						2014-12-20 07:28:27 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4e30195c6d 
							
						 
					 
					
						
						
							
							* Refactor morphology.pyx  
						
						
						
					 
					
						2014-12-20 07:27:28 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4c6ce7ee84 
							
						 
					 
					
						
						
							
							* Update tokens.pyx as part of reorg  
						
						
						
					 
					
						2014-12-20 07:03:26 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							116f7f3bc1 
							
						 
					 
					
						
						
							
							* Rename Lexicon to Vocab, and move it to its own file  
						
						
						
					 
					
						2014-12-20 06:54:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							780cbd68b1 
							
						 
					 
					
						
						
							
							* Move all struct definitions to structs.pxd, to avoid circular dependencies  
						
						
						
					 
					
						2014-12-20 06:51:33 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f6556d8e5d 
							
						 
					 
					
						
						
							
							* Refactor, move Lexeme struct to structs.pxd  
						
						
						
					 
					
						2014-12-20 06:51:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7d48bba6c4 
							
						 
					 
					
						
						
							
							* Move StringStore class to its own file  
						
						
						
					 
					
						2014-12-20 06:42:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b066102d2d 
							
						 
					 
					
						
						
							
							* Remove POS cache for now  
						
						
						
					 
					
						2014-12-20 03:49:58 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ff252dd535 
							
						 
					 
					
						
						
							
							* Clean up 'guess_cache' idea, which didnt work well enough  
						
						
						
					 
					
						2014-12-20 03:49:11 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9d3ca13909 
							
						 
					 
					
						
						
							
							* Start work on parse-tree iteration classes  
						
						
						
					 
					
						2014-12-20 03:48:10 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bed680c632 
							
						 
					 
					
						
						
							
							* Remove commented-out features  
						
						
						
					 
					
						2014-12-20 03:47:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d178c03ae 
							
						 
					 
					
						
						
							
							* Prune the features a bit  
						
						
						
					 
					
						2014-12-20 02:46:14 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a0408e1758 
							
						 
					 
					
						
						
							
							* Working DecisionMemory class  
						
						
						
					 
					
						2014-12-20 01:43:26 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7920ea72b4 
							
						 
					 
					
						
						
							
							* Working parser with the decision memory idea. Disabling that for now, for simplicity  
						
						
						
					 
					
						2014-12-20 01:43:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a2f2a48da9 
							
						 
					 
					
						
						
							
							* Add some extra features  
						
						
						
					 
					
						2014-12-20 01:42:24 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8fd9762d91 
							
						 
					 
					
						
						
							
							* Start laying out parse tree iteration methods  
						
						
						
					 
					
						2014-12-20 01:42:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							53b8bc1f3c 
							
						 
					 
					
						
						
							
							* Work on implementing a trainable cache for the parser. So far, doesn't improve efficiency  
						
						
						
					 
					
						2014-12-19 09:30:50 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							033d6c9ac2 
							
						 
					 
					
						
						
							
							* Adapt POS tagger decision-memory for use in parser  
						
						
						
					 
					
						2014-12-19 07:23:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							809ddf7887 
							
						 
					 
					
						
						
							
							* Add index.pxd  
						
						
						
					 
					
						2014-12-19 07:23:00 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1879abd16a 
							
						 
					 
					
						
						
							
							* Set const-correctness for tagger  
						
						
						
					 
					
						2014-12-18 20:41:52 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f72243b156 
							
						 
					 
					
						
						
							
							* Set const-correctness for Feature* array  
						
						
						
					 
					
						2014-12-18 20:41:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6ab7e40590 
							
						 
					 
					
						
						
							
							* Add non-monotonic parsing with cost-sensitive update. 92.26 on Y&M set  
						
						
						
					 
					
						2014-12-18 11:33:25 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7e0c692daf 
							
						 
					 
					
						
						
							
							* Automatically push when the stack is empty  
						
						
						
					 
					
						2014-12-18 09:16:10 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							61142a8eff 
							
						 
					 
					
						
						
							
							* Tweak features  
						
						
						
					 
					
						2014-12-18 09:15:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8446ebfbbb 
							
						 
					 
					
						
						
							
							* Work on parser. Up to 92 UAS on YM labels  
						
						
						
					 
					
						2014-12-18 09:05:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							55de747bfc 
							
						 
					 
					
						
						
							
							* Remove .cpp files  
						
						
						
					 
					
						2014-12-18 02:43:13 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4448a840f7 
							
						 
					 
					
						
						
							
							* Work on greedy parsing. Scoring about 91.2  
						
						
						
					 
					
						2014-12-18 02:42:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							87e9487d76 
							
						 
					 
					
						
						
							
							* Work on parser  
						
						
						
					 
					
						2014-12-17 21:10:12 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9d7d97978d 
							
						 
					 
					
						
						
							
							* Work on greedy parser  
						
						
						
					 
					
						2014-12-17 21:09:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d524dd306a 
							
						 
					 
					
						
						
							
							* Work on greedy parser  
						
						
						
					 
					
						2014-12-17 03:19:43 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							95ccea03b2 
							
						 
					 
					
						
						
							
							* Work on greedy parser  
						
						
						
					 
					
						2014-12-16 22:46:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a432862fde 
							
						 
					 
					
						
						
							
							* Add exception type to _arg_max_among in tagger  
						
						
						
					 
					
						2014-12-16 09:44:19 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9e00798820 
							
						 
					 
					
						
						
							
							* Work on integrating a greedy dependency parser  
						
						
						
					 
					
						2014-12-16 08:06:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							792802b2b9 
							
						 
					 
					
						
						
							
							* POS tag memoisation working, with good speed-up  
						
						
						
					 
					
						2014-12-12 14:33:51 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ca54d58638 
							
						 
					 
					
						
						
							
							* Merge setup.py  
						
						
						
					 
					
						2014-12-10 15:21:27 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9959a64f7b 
							
						 
					 
					
						
						
							
							* Working morphology and lemmatisation. POS tagging quite fast.  
						
						
						
					 
					
						2014-12-10 08:09:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							df3be14987 
							
						 
					 
					
						
						
							
							* Add pos_type features to POS tagger  
						
						
						
					 
					
						2014-12-10 08:08:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							42973c4b37 
							
						 
					 
					
						
						
							
							* Improve efficiency of tagger, and improve morphological processing  
						
						
						
					 
					
						2014-12-10 01:02:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6b34a2f34b 
							
						 
					 
					
						
						
							
							* Move morphological analysis into its own module, morphology.pyx  
						
						
						
					 
					
						2014-12-09 21:16:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b962fe73d7 
							
						 
					 
					
						
						
							
							* Make suffixes file use full-power regex, so that we can handle periods properly  
						
						
						
					 
					
						2014-12-09 19:04:27 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							accdbe989b 
							
						 
					 
					
						
						
							
							* Remove Tokens.extend method  
						
						
						
					 
					
						2014-12-09 17:09:23 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							495e1c7366 
							
						 
					 
					
						
						
							
							* Use fused type in Tokens.push_back, simplifying the use of the cache  
						
						
						
					 
					
						2014-12-09 16:50:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							302e09018b 
							
						 
					 
					
						
						
							
							* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas  
						
						
						
					 
					
						2014-12-09 14:48:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							99bbbb6feb 
							
						 
					 
					
						
						
							
							* Work on morphological processing  
						
						
						
					 
					
						2014-12-08 21:12:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7b68f911cf 
							
						 
					 
					
						
						
							
							* Add WordNet lemmatizer  
						
						
						
					 
					
						2014-12-08 01:39:13 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c20dd79748 
							
						 
					 
					
						
						
							
							* Fiddle with const correctness and comments  
						
						
						
					 
					
						2014-12-08 00:03:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b031c7c430 
							
						 
					 
					
						
						
							
							* Remove language-general context module  
						
						
						
					 
					
						2014-12-07 23:53:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ef4398b204 
							
						 
					 
					
						
						
							
							* Rearrange POS stuff, so that language-specific stuff can live in language-specific modules  
						
						
						
					 
					
						2014-12-07 23:52:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							327383e38a 
							
						 
					 
					
						
						
							
							* Remove unused code in tagger.pyx  
						
						
						
					 
					
						2014-12-07 22:16:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9f17467c2e 
							
						 
					 
					
						
						
							
							* Fix EMPTY_TOKEN  
						
						
						
					 
					
						2014-12-07 22:07:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3819a88e1b 
							
						 
					 
					
						
						
							
							* Add support for tag dictionary, and fix error-code for predict method  
						
						
						
					 
					
						2014-12-07 22:07:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f00afe12c4 
							
						 
					 
					
						
						
							
							* Load POS tagger in load() function if path exists  
						
						
						
					 
					
						2014-12-07 22:05:57 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5fe5e6e66b 
							
						 
					 
					
						
						
							
							* Move context functions to header, inlining them.  
						
						
						
					 
					
						2014-12-07 21:59:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5caabec789 
							
						 
					 
					
						
						
							
							* Link in tagger, to work on integrating POS tagging  
						
						
						
					 
					
						2014-12-07 15:29:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0c7aeb9de7 
							
						 
					 
					
						
						
							
							* Begin revising tagger, focussing on POS tagging  
						
						
						
					 
					
						2014-12-07 15:29:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f5c4f2eb52 
							
						 
					 
					
						
						
							
							* Revise context, focussing on POS tagging for now  
						
						
						
					 
					
						2014-12-07 15:28:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e27b912ef9 
							
						 
					 
					
						
						
							
							* Remove need for confusing _data pointer to be stored on Tokens  
						
						
						
					 
					
						2014-12-05 16:31:30 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1c9253701d 
							
						 
					 
					
						
						
							
							* Introduce a TokenC struct, to handle token indices, pos tags and sense tags  
						
						
						
					 
					
						2014-12-05 15:56:14 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							187372c7f3 
							
						 
					 
					
						
						
							
							* Allow the lexicon to create lexemes using an external memory pool, so that it can decide to make some lexemes temporary, rather than cached  
						
						
						
					 
					
						2014-12-05 03:29:50 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							75b8dfb348 
							
						 
					 
					
						
						
							
							* Remove upper_pc from lexeme.pyx  
						
						
						
					 
					
						2014-12-04 22:14:34 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							49f3780ff5 
							
						 
					 
					
						
						
							
							* Fiddle with lexeme attrs  
						
						
						
					 
					
						2014-12-04 21:22:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							564082e48e 
							
						 
					 
					
						
						
							
							* Hack Token class to take lex.dense inplace of the old lex.norm. This needs to be fixed...  
						
						
						
					 
					
						2014-12-04 20:51:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							69bb022204 
							
						 
					 
					
						
						
							
							* Add as_array and count_by method  
						
						
						
					 
					
						2014-12-04 20:46:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e1b1f45cc9 
							
						 
					 
					
						
						
							
							* Add STEM attribute to lexeme  
						
						
						
					 
					
						2014-12-04 20:46:20 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d7952634ca 
							
						 
					 
					
						
						
							
							* Make the string-store serve const pointers to Utf8Str  
						
						
						
					 
					
						2014-12-03 16:01:47 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7e04c22f8f 
							
						 
					 
					
						
						
							
							* const added to Lexicon interface. Seems to work.  
						
						
						
					 
					
						2014-12-03 15:58:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d70d31aa45 
							
						 
					 
					
						
						
							
							* Introduce first attempt at const-ness  
						
						
						
					 
					
						2014-12-03 15:44:25 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4560ada85b 
							
						 
					 
					
						
						
							
							* Add typedef for attr_t. Change flag_t to flags_t  
						
						
						
					 
					
						2014-12-03 11:06:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e600f7b327 
							
						 
					 
					
						
						
							
							* Move String struct stuff into the utf8string module, from spacy.lang  
						
						
						
					 
					
						2014-12-03 11:06:00 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e170faf5b0 
							
						 
					 
					
						
						
							
							* Hack Tokens to work without tagger.pyx  
						
						
						
					 
					
						2014-12-03 11:05:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b463a7eb86 
							
						 
					 
					
						
						
							
							* Make flag-setting a language-specific thing  
						
						
						
					 
					
						2014-12-03 11:04:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							71b009e323 
							
						 
					 
					
						
						
							
							* Fix bug in refactored StringStore.__getitem__  
						
						
						
					 
					
						2014-12-03 11:02:24 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							14097311ae 
							
						 
					 
					
						
						
							
							* Make StringStore.__getitem__ accept unicode-typed keys.  
						
						
						
					 
					
						2014-12-03 01:33:20 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							522bb0346e 
							
						 
					 
					
						
						
							
							* Work on get_array method of Tokens  
						
						
						
					 
					
						2014-12-02 23:48:05 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8c2938fe01 
							
						 
					 
					
						
						
							
							* Rename Lexicon._dict to Lexicon._map  
						
						
						
					 
					
						2014-12-02 23:46:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							33dfb4933c 
							
						 
					 
					
						
						
							
							* Remove taggers from Language class. Work on doc strings  
						
						
						
					 
					
						2014-11-26 19:53:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							80baa2e3db 
							
						 
					 
					
						
						
							
							* Work on beam parser  
						
						
						
					 
					
						2014-11-20 19:49:33 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5c3016bac8 
							
						 
					 
					
						
						
							
							* Tmp commit of ner code  
						
						
						
					 
					
						2014-11-14 18:27:47 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							33c421bcf8 
							
						 
					 
					
						
						
							
							* More feature tweaks  
						
						
						
					 
					
						2014-11-12 23:59:16 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							41dedfb14e 
							
						 
					 
					
						
						
							
							* Add label features for NER parsing  
						
						
						
					 
					
						2014-11-12 23:55:10 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cf55b48ba6 
							
						 
					 
					
						
						
							
							* Switch to predict label on shift. Big increase in accuracy.  
						
						
						
					 
					
						2014-11-12 23:50:12 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8f84e8a78b 
							
						 
					 
					
						
						
							
							* Neaten oracle  
						
						
						
					 
					
						2014-11-12 23:38:07 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7e0a9077dd 
							
						 
					 
					
						
						
							
							* Add context files  
						
						
						
					 
					
						2014-11-12 23:22:36 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3b0b902384 
							
						 
					 
					
						
						
							
							* IOB-style parsing working. Accuracy down from BILOU, form 87-88 to 85-86  
						
						
						
					 
					
						2014-11-12 23:21:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e6bb8aa3a9 
							
						 
					 
					
						
						
							
							* Move moves to bilou_moves. Refactor context, returning to the simpler giant-enum style  
						
						
						
					 
					
						2014-11-12 00:54:50 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c788633429 
							
						 
					 
					
						
						
							
							* Add tokens_from_list method to Language  
						
						
						
					 
					
						2014-11-11 23:43:14 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							95282d4993 
							
						 
					 
					
						
						
							
							* Use the dynamic oracle 'follow' strategy  
						
						
						
					 
					
						2014-11-11 21:11:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5aaf7a024d 
							
						 
					 
					
						
						
							
							* Move ner features to ner subdir  
						
						
						
					 
					
						2014-11-11 21:09:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ff8989b63c 
							
						 
					 
					
						
						
							
							* Use greedy NER parser  
						
						
						
					 
					
						2014-11-11 21:08:35 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0d943ab358 
							
						 
					 
					
						
						
							
							* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger.  
						
						
						
					 
					
						2014-11-11 17:17:54 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							399239760b 
							
						 
					 
					
						
						
							
							* Fix moves for new State struct  
						
						
						
					 
					
						2014-11-10 22:16:05 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							82247169f2 
							
						 
					 
					
						
						
							
							* Implement validation and oracle on pystate, for testing  
						
						
						
					 
					
						2014-11-10 22:15:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3709ed9d6d 
							
						 
					 
					
						
						
							
							* Add curr field to State, to handle entity being built  
						
						
						
					 
					
						2014-11-10 22:14:36 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							af9ed18cf1 
							
						 
					 
					
						
						
							
							* Bug fixes to NER  
						
						
						
					 
					
						2014-11-10 17:39:23 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9f2587f5ec 
							
						 
					 
					
						
						
							
							* Work on shift-reduce NER  
						
						
						
					 
					
						2014-11-10 16:28:56 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f307eb2e36 
							
						 
					 
					
						
						
							
							* Refactor context extraction, and start breaking out gold standards into their own functions  
						
						
						
					 
					
						2014-11-09 15:43:07 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							602f993af9 
							
						 
					 
					
						
						
							
							* Moving tagger to accept multiple correct answers  
						
						
						
					 
					
						2014-11-09 15:18:33 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f37d896a42 
							
						 
					 
					
						
						
							
							* Upd NER feats. With adadelta learner, getting 76.9 on NER  
						
						
						
					 
					
						2014-11-07 04:43:54 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							68d1cdad62 
							
						 
					 
					
						
						
							
							* When encoding POS/NER tags, accept '-' as a missing value  
						
						
						
					 
					
						2014-11-07 04:42:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							949a6245f9 
							
						 
					 
					
						
						
							
							* Increase default number of iterations from 5 to 10  
						
						
						
					 
					
						2014-11-07 04:42:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3cab1d9a29 
							
						 
					 
					
						
						
							
							* Refine word_shape feature, by trimming the max sequence length  
						
						
						
					 
					
						2014-11-07 04:41:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b4454cf036 
							
						 
					 
					
						
						
							
							* Add extra context tokens  
						
						
						
					 
					
						2014-11-07 04:40:36 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							50309e6e49 
							
						 
					 
					
						
						
							
							* Fix context vector, importing all features  
						
						
						
					 
					
						2014-11-05 22:11:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							07a23768de 
							
						 
					 
					
						
						
							
							* Play with NER feats a bit. Up to 82.00 training on MUC7.  
						
						
						
					 
					
						2014-11-05 21:47:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4ecbe8c893 
							
						 
					 
					
						
						
							
							* Complete refactor of Tagger features, to use a generic list of context names.  
						
						
						
					 
					
						2014-11-05 20:45:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0a8c84625d 
							
						 
					 
					
						
						
							
							* Moving feature context stuff to a generalized place  
						
						
						
					 
					
						2014-11-05 19:55:10 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3733444101 
							
						 
					 
					
						
						
							
							* Generalize tagger code, in preparation for NER and supersense tagging.  
						
						
						
					 
					
						2014-11-05 03:42:14 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							abbe3e44b0 
							
						 
					 
					
						
						
							
							* Move spacy.pos tagger to spacy.tagger, and generalize it so that it can take on other tagging tasks, given a different set of feature templates.  
						
						
						
					 
					
						2014-11-05 00:37:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							954c970415 
							
						 
					 
					
						
						
							
							* Add __iter__ method to tokens  
						
						
						
					 
					
						2014-11-04 01:07:08 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f07457a91f 
							
						 
					 
					
						
						
							
							* Remove POS alignment stuff. Now use training data based on raw text, instead of clumsy detokenization stuff  
						
						
						
					 
					
						2014-11-04 01:06:43 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ae52f9f38c 
							
						 
					 
					
						
						
							
							* Remove vocab10k from tokens  
						
						
						
					 
					
						2014-11-03 00:23:20 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							32fb50dc35 
							
						 
					 
					
						
						
							
							* Remove non_sparse method --- features wanting this can do it easily enough.  
						
						
						
					 
					
						2014-11-03 00:15:47 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b5ae1471db 
							
						 
					 
					
						
						
							
							* Fiddle with POS tag features  
						
						
						
					 
					
						2014-11-03 00:15:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							70ea862703 
							
						 
					 
					
						
						
							
							* Remove vocab10k field, and add flags for gazetteers  
						
						
						
					 
					
						2014-11-03 00:13:51 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							711ed0f636 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2014-11-02 14:22:32 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fcd9490d56 
							
						 
					 
					
						
						
							
							* Add pos_tag method to Language  
						
						
						
					 
					
						2014-11-02 14:21:43 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							829bb2bdbe 
							
						 
					 
					
						
						
							
							* Add mappings to Twitter POS tag corpus  
						
						
						
					 
					
						2014-11-02 13:21:19 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							437cd2217d 
							
						 
					 
					
						
						
							
							* Fix strings i/o, removing use of ujson library in favour of plain text file. Allows better control of codecs.  
						
						
						
					 
					
						2014-11-02 13:20:37 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3352e89e21 
							
						 
					 
					
						
						
							
							* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web  
						
						
						
					 
					
						2014-11-02 13:19:54 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8335706321 
							
						 
					 
					
						
						
							
							* Add LIKE_URL and LIKE_NUMBER flag features  
						
						
						
					 
					
						2014-11-02 13:19:23 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5484fbea69 
							
						 
					 
					
						
						
							
							* Implement is_number  
						
						
						
					 
					
						2014-11-01 19:13:24 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f685218e21 
							
						 
					 
					
						
						
							
							* Add is_urlish function  
						
						
						
					 
					
						2014-11-01 17:39:34 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							09a3e54176 
							
						 
					 
					
						
						
							
							* Delete print statements from stringstore  
						
						
						
					 
					
						2014-10-31 17:45:26 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b186a66bae 
							
						 
					 
					
						
						
							
							* Rename Token.lex_pos to Token.postype, and Token.lex_supersense to Token.sensetype  
						
						
						
					 
					
						2014-10-31 17:44:39 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a8ca078b24 
							
						 
					 
					
						
						
							
							* Restore lexemes field to lexicon  
						
						
						
					 
					
						2014-10-31 17:43:25 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6c807aa45f 
							
						 
					 
					
						
						
							
							* Restore id attribute to lexeme, and rename pos field to postype, to store clustered tag dictionaries  
						
						
						
					 
					
						2014-10-31 17:43:00 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							aaf6953fe0 
							
						 
					 
					
						
						
							
							* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web.  
						
						
						
					 
					
						2014-10-31 17:42:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f67cb9a5a3 
							
						 
					 
					
						
						
							
							* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web.  
						
						
						
					 
					
						2014-10-31 17:42:04 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ea8f1e7053 
							
						 
					 
					
						
						
							
							* Tighten interfaces  
						
						
						
					 
					
						2014-10-30 18:14:42 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ea85bf3a0a 
							
						 
					 
					
						
						
							
							* Tighten the interface to Language  
						
						
						
					 
					
						2014-10-30 18:01:27 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c6fcd03692 
							
						 
					 
					
						
						
							
							* Small efficiency tweak to lexeme init  
						
						
						
					 
					
						2014-10-30 17:56:11 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							87c2418a89 
							
						 
					 
					
						
						
							
							* Fiddle with data types on Lexeme, to compress them to a much smaller size.  
						
						
						
					 
					
						2014-10-30 15:42:15 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ac88893232 
							
						 
					 
					
						
						
							
							* Fix Token after lexeme changes  
						
						
						
					 
					
						2014-10-30 15:30:52 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e6b87766fe 
							
						 
					 
					
						
						
							
							* Remove lexemes vector from Lexicon, and the id and hash attributes from Lexeme  
						
						
						
					 
					
						2014-10-30 15:21:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							889b7b48b4 
							
						 
					 
					
						
						
							
							* Fix POS tagger, so that it loads correctly. Lexemes are being read in.  
						
						
						
					 
					
						2014-10-30 13:38:55 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							67c8c8019f 
							
						 
					 
					
						
						
							
							* Update lexeme serialization, using a binary file format  
						
						
						
					 
					
						2014-10-30 01:01:00 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							13909a2e24 
							
						 
					 
					
						
						
							
							* Rewriting Lexeme serialization.  
						
						
						
					 
					
						2014-10-29 23:19:38 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							234d49bf4d 
							
						 
					 
					
						
						
							
							* Seems to be working after refactor. Need to wire up more POS tag features, and wire up save/load of POS tags.  
						
						
						
					 
					
						2014-10-24 02:23:42 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							08ce602243 
							
						 
					 
					
						
						
							
							* Large refactor, particularly to Python API  
						
						
						
					 
					
						2014-10-24 00:59:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7baef5b7ff 
							
						 
					 
					
						
						
							
							* Fix padding on tokens  
						
						
						
					 
					
						2014-10-23 04:01:17 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							96b835a3d4 
							
						 
					 
					
						
						
							
							* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags.  
						
						
						
					 
					
						2014-10-23 03:20:02 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e5e951ae67 
							
						 
					 
					
						
						
							
							* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding.  
						
						
						
					 
					
						2014-10-23 01:57:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ea1d4a81eb 
							
						 
					 
					
						
						
							
							* Refactoring get_atoms, improving tokens API  
						
						
						
					 
					
						2014-10-22 13:10:56 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ad49e2482e 
							
						 
					 
					
						
						
							
							* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text.  
						
						
						
					 
					
						2014-10-22 12:57:06 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0a0e41f6c8 
							
						 
					 
					
						
						
							
							* Add prefix and suffix features  
						
						
						
					 
					
						2014-10-22 12:56:09 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7018b53d3a 
							
						 
					 
					
						
						
							
							* Improve array features in tokens  
						
						
						
					 
					
						2014-10-22 12:55:42 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							43d5964e13 
							
						 
					 
					
						
						
							
							* Add function to read detokenization rules  
						
						
						
					 
					
						2014-10-22 12:54:59 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							224bdae996 
							
						 
					 
					
						
						
							
							* Add POS utilities  
						
						
						
					 
					
						2014-10-22 10:17:57 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5ebe14f353 
							
						 
					 
					
						
						
							
							* Add greedy pos tagger  
						
						
						
					 
					
						2014-10-22 10:17:26 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							12742f4f83 
							
						 
					 
					
						
						
							
							* Add detokenize method and test  
						
						
						
					 
					
						2014-10-18 18:07:29 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							99f5e59286 
							
						 
					 
					
						
						
							
							* Have tokenizer emit tokens for whitespace other than single spaces  
						
						
						
					 
					
						2014-10-14 20:25:57 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							43743a5d63 
							
						 
					 
					
						
						
							
							* Work on efficiency  
						
						
						
					 
					
						2014-10-14 18:22:41 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6fb42c4919 
							
						 
					 
					
						
						
							
							* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang  
						
						
						
					 
					
						2014-10-14 16:17:45 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2805068ca8 
							
						 
					 
					
						
						
							
							* Have tokens track tuples that record the start offset and pos tag as well as a lexeme pointer  
						
						
						
					 
					
						2014-10-14 15:21:03 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							65d3ead4fd 
							
						 
					 
					
						
						
							
							* Rename LexStr_casefix to LexStr_norm and LexInt_i to LexInt_id  
						
						
						
					 
					
						2014-10-14 15:19:07 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							868e558037 
							
						 
					 
					
						
						
							
							* Preparations in place to handle hyphenation etc  
						
						
						
					 
					
						2014-10-10 20:23:23 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ff79dbac2e 
							
						 
					 
					
						
						
							
							* More slight cleaning for lang.pyx  
						
						
						
					 
					
						2014-10-10 20:11:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3d82ed1e5e 
							
						 
					 
					
						
						
							
							* More slight cleaning for lang.pyx  
						
						
						
					 
					
						2014-10-10 19:50:07 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							02e948e7d5 
							
						 
					 
					
						
						
							
							* Remove counts stuff from Language class  
						
						
						
					 
					
						2014-10-10 19:25:01 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							71ee921055 
							
						 
					 
					
						
						
							
							* Slight cleaning of tokenizer code  
						
						
						
					 
					
						2014-10-10 19:17:22 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							59b41a9fd3 
							
						 
					 
					
						
						
							
							* Switch to new data model, tests passing  
						
						
						
					 
					
						2014-10-10 08:11:31 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1b0e01d3d8 
							
						 
					 
					
						
						
							
							* Revising data model of lexeme. Compiles.  
						
						
						
					 
					
						2014-10-09 19:53:30 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e40caae51f 
							
						 
					 
					
						
						
							
							* Update Lexicon class to expect a list of lexeme dict descriptions  
						
						
						
					 
					
						2014-10-09 14:51:35 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							51d75b244b 
							
						 
					 
					
						
						
							
							* Add serialize/deserialize functions for lexeme, transport to/from python dict.  
						
						
						
					 
					
						2014-10-09 14:10:46 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d73d89a2de 
							
						 
					 
					
						
						
							
							* Add i attribute to lexeme, giving lexemes sequential IDs.  
						
						
						
					 
					
						2014-10-09 13:50:05 +11:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							096ef2b199 
							
						 
					 
					
						
						
							
							* Rename external hashing lib, from trustyc to preshed  
						
						
						
					 
					
						2014-09-26 18:40:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							11a346fd5e 
							
						 
					 
					
						
						
							
							* Remove hashing modules, which are now taken over by external lib  
						
						
						
					 
					
						2014-09-26 18:39:40 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							93505276ed 
							
						 
					 
					
						
						
							
							* Add German tokenizer files  
						
						
						
					 
					
						2014-09-25 18:29:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2e44fa7179 
							
						 
					 
					
						
						
							
							* Add util.py  
						
						
						
					 
					
						2014-09-25 18:26:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b15619e170 
							
						 
					 
					
						
						
							
							* Use PointerHash instead of locally provided _hashing module  
						
						
						
					 
					
						2014-09-25 18:23:35 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ed446c67ad 
							
						 
					 
					
						
						
							
							* Add typedefs file  
						
						
						
					 
					
						2014-09-17 23:10:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							316a57c4be 
							
						 
					 
					
						
						
							
							* Remove own memory classes, which have now been broken out into their own package  
						
						
						
					 
					
						2014-09-17 23:10:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ac522e2553 
							
						 
					 
					
						
						
							
							* Switch from own memory class to cymem, in pip  
						
						
						
					 
					
						2014-09-17 23:09:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6266cac593 
							
						 
					 
					
						
						
							
							* Switch to using a Python ref counted gateway to malloc/free, to prevent memory leaks  
						
						
						
					 
					
						2014-09-17 20:02:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5a20dfc03e 
							
						 
					 
					
						
						
							
							* Add memory management code  
						
						
						
					 
					
						2014-09-17 20:02:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0152831c89 
							
						 
					 
					
						
						
							
							* Refactor tokenization, enable cache, and ensure we look up specials correctly even when there's confusing punctuation surrounding the token.  
						
						
						
					 
					
						2014-09-16 18:01:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							143e51ec73 
							
						 
					 
					
						
						
							
							* Refactor tokenization, splitting it into a clearer life-cycle.  
						
						
						
					 
					
						2014-09-16 13:16:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c396581a0b 
							
						 
					 
					
						
						
							
							* Fiddle with the way strings are interned in lexeme  
						
						
						
					 
					
						2014-09-15 06:34:45 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0bb547ab98 
							
						 
					 
					
						
						
							
							* Fix memory error in cache, where entry wasn't being null-terminated. Various other changes, some good for performance  
						
						
						
					 
					
						2014-09-15 06:34:10 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7959141d36 
							
						 
					 
					
						
						
							
							* Add a few abbreviations, to get tests to pass  
						
						
						
					 
					
						2014-09-15 06:32:18 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d235299260 
							
						 
					 
					
						
						
							
							* Few nips and tucks to hash table  
						
						
						
					 
					
						2014-09-15 05:03:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e68a431e5e 
							
						 
					 
					
						
						
							
							* Pass only the tokens vector to _tokenize, instead of the whole python object.  
						
						
						
					 
					
						2014-09-15 04:01:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							08cef75ffd 
							
						 
					 
					
						
						
							
							* Switch to using a heap-allocated vector in tokens  
						
						
						
					 
					
						2014-09-15 03:46:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f77b7098c0 
							
						 
					 
					
						
						
							
							* Upd Tokens to use vector, with bounds checking.  
						
						
						
					 
					
						2014-09-15 03:22:40 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0f6bf2a2ee 
							
						 
					 
					
						
						
							
							* Fix niggling memory error, which was caused by bug in the way tokens resized their internal vector.  
						
						
						
					 
					
						2014-09-15 02:08:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							df24e3708c 
							
						 
					 
					
						
						
							
							* Move EnglishTokens stuff to Tokens  
						
						
						
					 
					
						2014-09-15 01:31:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bd08cb09a2 
							
						 
					 
					
						
						
							
							* Remove short-circuiting of initial_size argument for PointerHash  
						
						
						
					 
					
						2014-09-15 01:30:49 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f3393cf57c 
							
						 
					 
					
						
						
							
							* Improve interface for PointerHash  
						
						
						
					 
					
						2014-09-13 17:29:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							45865be37e 
							
						 
					 
					
						
						
							
							* Switch hash interface, using void* instead of size_t, to avoid casts.  
						
						
						
					 
					
						2014-09-13 17:02:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0447279c57 
							
						 
					 
					
						
						
							
							* PointerHash working, efficiency is good. 6-7 mins  
						
						
						
					 
					
						2014-09-13 16:43:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							85d68e8e95 
							
						 
					 
					
						
						
							
							* Replaced cache with own hash table. Similar timing  
						
						
						
					 
					
						2014-09-13 03:14:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c8db76e3e1 
							
						 
					 
					
						
						
							
							* Add initial work on simple hash table  
						
						
						
					 
					
						2014-09-13 02:02:41 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							afdc9b7ac2 
							
						 
					 
					
						
						
							
							* More performance fiddling, particularly moving the specials into the cache, so that we can just lookup the cache in _tokenize  
						
						
						
					 
					
						2014-09-13 00:59:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7d239df4c8 
							
						 
					 
					
						
						
							
							* Fiddle with declarations, for small efficiency boost  
						
						
						
					 
					
						2014-09-13 00:31:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a8e7cce30f 
							
						 
					 
					
						
						
							
							* Efficiency tweaks  
						
						
						
					 
					
						2014-09-13 00:14:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							126a8453a5 
							
						 
					 
					
						
						
							
							* Fix performance issues by implementing a better cache. Add own String struct to help  
						
						
						
					 
					
						2014-09-12 23:50:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9298e36b36 
							
						 
					 
					
						
						
							
							* Move special tokenization into its own lookup table, away from the cache.  
						
						
						
					 
					
						2014-09-12 19:43:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							985bc68327 
							
						 
					 
					
						
						
							
							* Fix bug with trailing punct on contractions. Reduced efficiency, and slightly hacky implementation.  
						
						
						
					 
					
						2014-09-12 18:26:26 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7eab281194 
							
						 
					 
					
						
						
							
							* Fiddle with token features  
						
						
						
					 
					
						2014-09-12 15:49:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5aa591106b 
							
						 
					 
					
						
						
							
							* Fiddle with token features  
						
						
						
					 
					
						2014-09-12 15:49:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1533041885 
							
						 
					 
					
						
						
							
							* Update the split_one method, so that it doesn't need to cast back to a Python object  
						
						
						
					 
					
						2014-09-12 05:10:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4817277d66 
							
						 
					 
					
						
						
							
							* Replace main lexicon dict with dense_hash_map. May be unsuitable, if strings need recovery.  
						
						
						
					 
					
						2014-09-12 04:29:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8b20e9ad97 
							
						 
					 
					
						
						
							
							* Delete ununused _split method  
						
						
						
					 
					
						2014-09-12 04:03:52 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a4863686ec 
							
						 
					 
					
						
						
							
							* Changed cache to use a linked-list data structure, to take out Python list code. Taking 6-7 mins for gigaword.  
						
						
						
					 
					
						2014-09-12 03:30:50 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							51e2006a65 
							
						 
					 
					
						
						
							
							* Increase cache size. Processing now 6-7 mins  
						
						
						
					 
					
						2014-09-12 02:52:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e096f30161 
							
						 
					 
					
						
						
							
							* Tweak signatures and refactor slightly. Processing gigaword taking 8-9 mins. Tests passing, but some sort of memory bug on exit.  
						
						
						
					 
					
						2014-09-12 02:43:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							073ee0de63 
							
						 
					 
					
						
						
							
							* Restore dense_hash_map for cache dictionary. Seems to double efficiency  
						
						
						
					 
					
						2014-09-12 02:23:51 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3c928fb5e0 
							
						 
					 
					
						
						
							
							* Switch to 64 bit hashes, for better reliability  
						
						
						
					 
					
						2014-09-12 02:04:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							2389bd1b10 
							
						 
					 
					
						
						
							
							* Improve cache mechanism by including a random element depending on the size of the cache.  
						
						
						
					 
					
						2014-09-12 00:19:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c8f7c8bfde 
							
						 
					 
					
						
						
							
							* Moving to storing LexemeC structs internally  
						
						
						
					 
					
						2014-09-11 21:54:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bf9c60c31c 
							
						 
					 
					
						
						
							
							* Moving to storing LexemeC structs internally  
						
						
						
					 
					
						2014-09-11 21:44:58 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							563047e90f 
							
						 
					 
					
						
						
							
							* Switch to returning a Tokens object  
						
						
						
					 
					
						2014-09-11 21:37:32 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1a3222af4b 
							
						 
					 
					
						
						
							
							* Moving tokens to use an array internally, instead of a list of Lexeme objects.  
						
						
						
					 
					
						2014-09-11 16:57:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5b1c651661 
							
						 
					 
					
						
						
							
							* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon.  
						
						
						
					 
					
						2014-09-11 12:28:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e567713429 
							
						 
					 
					
						
						
							
							* Moving back to lexeme structs  
						
						
						
					 
					
						2014-09-10 20:41:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b488224c09 
							
						 
					 
					
						
						
							
							* Restoring Lexeme-as-struct  
						
						
						
					 
					
						2014-09-10 20:41:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							7c09c73a14 
							
						 
					 
					
						
						
							
							* Refactor to use tokens class.  
						
						
						
					 
					
						2014-09-10 18:27:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							cf412adba8 
							
						 
					 
					
						
						
							
							* Refactoring to use Tokens object  
						
						
						
					 
					
						2014-09-10 18:11:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8fbe9b6f97 
							
						 
					 
					
						
						
							
							* Bug fixes to flag features  
						
						
						
					 
					
						2014-09-01 23:41:31 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							151aa14bba 
							
						 
					 
					
						
						
							
							* Add asciify string transform, and other bits.  
						
						
						
					 
					
						2014-09-01 23:25:28 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c4ba216642 
							
						 
					 
					
						
						
							
							* Switch canon_case to get value, to avoid keyerror  
						
						
						
					 
					
						2014-09-01 17:27:36 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a779275a59 
							
						 
					 
					
						
						
							
							* Add canon_case function  
						
						
						
					 
					
						2014-08-30 20:57:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8bbfadfced 
							
						 
					 
					
						
						
							
							* Pass tests. Need to implement more feature functions.  
						
						
						
					 
					
						2014-08-30 20:36:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dcab14ede2 
							
						 
					 
					
						
						
							
							* Begin testing more functionality  
						
						
						
					 
					
						2014-08-30 19:01:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3e3ff99ca0 
							
						 
					 
					
						
						
							
							* Add orth features  
						
						
						
					 
					
						2014-08-30 19:01:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4e5b2d47e2 
							
						 
					 
					
						
						
							
							* More docs  
						
						
						
					 
					
						2014-08-29 03:01:40 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5233f110c4 
							
						 
					 
					
						
						
							
							* Adding PTB3 tokenizer back in, so can understand how much boilerplate is in the docs for multiple tokenizers  
						
						
						
					 
					
						2014-08-29 02:30:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							45a22d6b2c 
							
						 
					 
					
						
						
							
							* Docs coming together  
						
						
						
					 
					
						2014-08-29 01:59:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c282e6d5fb 
							
						 
					 
					
						
						
							
							* Redesign proceeding  
						
						
						
					 
					
						2014-08-28 19:45:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fd4e61e58b 
							
						 
					 
					
						
						
							
							* Fixed contraction tests. Need to correct problem with the way case stats and tag stats are supposed to work.  
						
						
						
					 
					
						2014-08-27 20:22:33 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fdaf24604a 
							
						 
					 
					
						
						
							
							* Basic punct tests updated and passing  
						
						
						
					 
					
						2014-08-27 19:38:57 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8d20617dfd 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2014-08-27 17:16:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e9a62b6eba 
							
						 
					 
					
						
						
							
							* Refactoring with Lexeme as a class now compiles. Basic design seems to work  
						
						
						
					 
					
						2014-08-27 17:15:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							68bae2fec6 
							
						 
					 
					
						
						
							
							* More refactoring  
						
						
						
					 
					
						2014-08-25 16:42:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							88095666dc 
							
						 
					 
					
						
						
							
							* Remove Lexeme struct, preparing to rename Word to Lexeme.  
						
						
						
					 
					
						2014-08-24 19:24:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ce59526011 
							
						 
					 
					
						
						
							
							* Add Word classes  
						
						
						
					 
					
						2014-08-24 18:14:08 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3b793cf4f7 
							
						 
					 
					
						
						
							
							* Tests passing for new Word object version  
						
						
						
					 
					
						2014-08-24 18:13:53 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							9815c7649e 
							
						 
					 
					
						
						
							
							* Refactor around Word objects, adapting tests. Tests passing, except for string views.  
						
						
						
					 
					
						2014-08-23 19:55:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4f01df9152 
							
						 
					 
					
						
						
							
							* Moving to Word objects in place of the Lexeme struct.  
						
						
						
					 
					
						2014-08-22 17:32:16 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							782806df08 
							
						 
					 
					
						
						
							
							* Moving to Word objects in place of the Lexeme struct.  
						
						
						
					 
					
						2014-08-22 17:28:23 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							47fbd0475a 
							
						 
					 
					
						
						
							
							* Replace the use of dense_hash_map with Python dict  
						
						
						
					 
					
						2014-08-22 17:13:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e289896603 
							
						 
					 
					
						
						
							
							* Fix ptb3 module  
						
						
						
					 
					
						2014-08-22 16:36:17 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							89d6faa9c9 
							
						 
					 
					
						
						
							
							* Move en_ptb to ptb3  
						
						
						
					 
					
						2014-08-22 04:24:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							07ecf5d2f4 
							
						 
					 
					
						
						
							
							* Fixed group_by, removed idea of general attr_of function.  
						
						
						
					 
					
						2014-08-22 00:02:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							811b7a6b91 
							
						 
					 
					
						
						
							
							* Struggling with arbitrary attr access...  
						
						
						
					 
					
						2014-08-21 23:49:14 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							314658b31c 
							
						 
					 
					
						
						
							
							* Improve module docstring  
						
						
						
					 
					
						2014-08-21 18:42:47 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d10993f41a 
							
						 
					 
					
						
						
							
							* More docs work  
						
						
						
					 
					
						2014-08-21 16:37:13 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							248cbb6d07 
							
						 
					 
					
						
						
							
							* Update doc strings  
						
						
						
					 
					
						2014-08-21 03:29:15 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							76afbd7d69 
							
						 
					 
					
						
						
							
							* Remove compiled orthography file  
						
						
						
					 
					
						2014-08-20 17:04:07 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f39dcb1d89 
							
						 
					 
					
						
						
							
							* Add orthography  
						
						
						
					 
					
						2014-08-20 17:03:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a78ad4152d 
							
						 
					 
					
						
						
							
							* Broken version being refactored for docs  
						
						
						
					 
					
						2014-08-20 13:39:39 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5fddb8d165 
							
						 
					 
					
						
						
							
							* Working refactor, with updated data model for Lexemes  
						
						
						
					 
					
						2014-08-19 04:21:20 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							3379d7a571 
							
						 
					 
					
						
						
							
							* Reforming data model for lexemes  
						
						
						
					 
					
						2014-08-19 02:40:37 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ab9b0daabf 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2014-08-18 23:21:49 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							1b71cbfe28 
							
						 
					 
					
						
						
							
							* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either.  
						
						
						
					 
					
						2014-08-18 20:48:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							bbf9a2c944 
							
						 
					 
					
						
						
							
							* Working version that uses arrays for chunks, which should be more memory efficient  
						
						
						
					 
					
						2014-08-18 20:23:54 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							8d3f6082be 
							
						 
					 
					
						
						
							
							* Working version, adding improvements  
						
						
						
					 
					
						2014-08-18 19:59:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							01469b0888 
							
						 
					 
					
						
						
							
							* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word.  
						
						
						
					 
					
						2014-08-18 19:14:00 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b94c9b72c9 
							
						 
					 
					
						
						
							
							* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached.  
						
						
						
					 
					
						2014-08-16 20:10:22 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							34b68a18ab 
							
						 
					 
					
						
						
							
							* Progress to getting WordTree working. Tests pass, but so far it's slower.  
						
						
						
					 
					
						2014-08-16 19:59:38 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							865cacfaf7 
							
						 
					 
					
						
						
							
							* Remove dependence on murmurhash  
						
						
						
					 
					
						2014-08-16 17:37:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							515d41d325 
							
						 
					 
					
						
						
							
							* Restore string saving to spacy  
						
						
						
					 
					
						2014-08-16 16:09:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							36073b89fe 
							
						 
					 
					
						
						
							
							* Restore unicode, work on improving string storage.  
						
						
						
					 
					
						2014-08-16 14:35:34 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a225ca5b0d 
							
						 
					 
					
						
						
							
							* Refactoring tokenizer  
						
						
						
					 
					
						2014-08-16 03:22:03 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							213a440ffc 
							
						 
					 
					
						
						
							
							* Add string decode and encode helpers to string_tools  
						
						
						
					 
					
						2014-08-15 23:57:27 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f11c8e22eb 
							
						 
					 
					
						
						
							
							* Remove happax stuff  
						
						
						
					 
					
						2014-08-02 22:11:28 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d6e07aa922 
							
						 
					 
					
						
						
							
							* Switch to 32bit hash for strings  
						
						
						
					 
					
						2014-08-02 21:51:52 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							365a2af756 
							
						 
					 
					
						
						
							
							* Restore happax. commit uncommited work  
						
						
						
					 
					
						2014-08-02 21:27:03 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6319ff0f22 
							
						 
					 
					
						
						
							
							* Add length property  
						
						
						
					 
					
						2014-08-02 21:26:44 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							18fb76b2c4 
							
						 
					 
					
						
						
							
							* Removed happax. Not sure if good idea.  
						
						
						
					 
					
						2014-08-02 20:53:35 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							edd38a84b1 
							
						 
					 
					
						
						
							
							* Removing happax stuff. Added length  
						
						
						
					 
					
						2014-08-02 20:45:12 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							fc7c10d7f8 
							
						 
					 
					
						
						
							
							* Ugly but seemingly working fix to the token memory leak  
						
						
						
					 
					
						2014-08-01 09:43:19 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c7bb6b329c 
							
						 
					 
					
						
						
							
							* Don't free clobbered lexemes, as they might be part of a tail  
						
						
						
					 
					
						2014-08-01 08:22:38 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							c48214460e 
							
						 
					 
					
						
						
							
							* Free lexemes clobbered as happaxes  
						
						
						
					 
					
						2014-08-01 07:40:20 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5b6457e80e 
							
						 
					 
					
						
						
							
							* Free lexemes clobbered as happaxes  
						
						
						
					 
					
						2014-08-01 07:37:50 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d8cb2288ce 
							
						 
					 
					
						
						
							
							* Roll back to using murmurhash2 for now  
						
						
						
					 
					
						2014-08-01 07:28:47 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f39211b2b1 
							
						 
					 
					
						
						
							
							* Add FixedTable for hashing  
						
						
						
					 
					
						2014-08-01 07:27:21 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a44e15f623 
							
						 
					 
					
						
						
							
							* Hack around lack of distribution features for now.  
						
						
						
					 
					
						2014-07-31 18:24:51 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4cb88c940b 
							
						 
					 
					
						
						
							
							* Fix memory leak in tokenizer, caused by having a fixed vocab.  
						
						
						
					 
					
						2014-07-31 18:19:38 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							5b81ee716f 
							
						 
					 
					
						
						
							
							* Use a sparse_hash_map to store happax vocab items, with a max size.  
						
						
						
					 
					
						2014-07-31 17:40:43 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							b9016c4633 
							
						 
					 
					
						
						
							
							* Switch to using sparsehash and murmurhash libraries out of pip  
						
						
						
					 
					
						2014-07-25 15:47:27 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a895fe5ddb 
							
						 
					 
					
						
						
							
							* Upd from spacy  
						
						
						
					 
					
						2014-07-23 17:35:18 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							87bf205b82 
							
						 
					 
					
						
						
							
							* Fix open apostrophe bug  
						
						
						
					 
					
						2014-07-07 23:26:01 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							571808a274 
							
						 
					 
					
						
						
							
							Group-by seems to be working  
						
						
						
					 
					
						2014-07-07 20:27:02 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							80b36f9f27 
							
						 
					 
					
						
						
							
							* 710k words per second for counts  
						
						
						
					 
					
						2014-07-07 19:12:19 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							057c21969b 
							
						 
					 
					
						
						
							
							* Refactor for string view features. Working on setting up flags and enums.  
						
						
						
					 
					
						2014-07-07 16:58:48 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							f1bcbd4c4e 
							
						 
					 
					
						
						
							
							* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well.  
						
						
						
					 
					
						2014-07-07 12:47:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6668e44961 
							
						 
					 
					
						
						
							
							* Whitespace  
						
						
						
					 
					
						2014-07-07 08:15:44 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0074ae2fc0 
							
						 
					 
					
						
						
							
							* Switch to dynamically allocating array, based on the document length  
						
						
						
					 
					
						2014-07-07 08:05:29 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							ff1869ff07 
							
						 
					 
					
						
						
							
							* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++  
						
						
						
					 
					
						2014-07-07 07:36:43 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							0c76143b72 
							
						 
					 
					
						
						
							
							* Give value for assert  
						
						
						
					 
					
						2014-07-07 05:10:46 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e244739dfe 
							
						 
					 
					
						
						
							
							* Fix ptb tokenization  
						
						
						
					 
					
						2014-07-07 05:10:09 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							dc20500920 
							
						 
					 
					
						
						
							
							* Remove cpp files  
						
						
						
					 
					
						2014-07-07 05:09:05 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							25849fc926 
							
						 
					 
					
						
						
							
							* Generalize tokenization rules to capitals  
						
						
						
					 
					
						2014-07-07 05:07:21 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							df0458001d 
							
						 
					 
					
						
						
							
							* Begin work on full PTB-compatible English tokenization  
						
						
						
					 
					
						2014-07-07 04:29:24 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							d5bef02c72 
							
						 
					 
					
						
						
							
							* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals  
						
						
						
					 
					
						2014-07-07 04:21:06 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							a62c38e1ef 
							
						 
					 
					
						
						
							
							* Working tokenization. en doesn't match PTB perfectly. Need to reorganize before adding more schemes.  
						
						
						
					 
					
						2014-07-07 01:15:59 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							4e79446dc2 
							
						 
					 
					
						
						
							
							* Reading in tokenization rules correctly. Passing tests.  
						
						
						
					 
					
						2014-07-07 00:02:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							72159e7011 
							
						 
					 
					
						
						
							
							* Fixes to tokenization. Now segment sequences of the same punctuation.  
						
						
						
					 
					
						2014-07-06 19:28:42 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							e98e97d483 
							
						 
					 
					
						
						
							
							* Possessive test passing  
						
						
						
					 
					
						2014-07-06 18:35:55 +02:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							556f6a18ca 
							
						 
					 
					
						
						
							
							* Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc.  
						
						
						
					 
					
						2014-07-05 20:51:42 +02:00