Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							e49c7f1478
							
						
					 | 
					
						
						
							
							* Update oov check in tokenizer
						
						
						
						
						
					 | 
					
						2015-07-18 22:45:28 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							cfd842769e
							
						
					 | 
					
						
						
							
							* Allow infix tokens to be variable length
						
						
						
						
						
					 | 
					
						2015-07-18 22:45:00 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							3b5baa660f
							
						
					 | 
					
						
						
							
							* Fix tokenizer
						
						
						
						
						
					 | 
					
						2015-07-14 00:10:51 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							24d6ce99ec
							
						
					 | 
					
						
						
							
							* Add comment to tokenizer, explaining the spacy attr
						
						
						
						
						
					 | 
					
						2015-07-13 22:29:13 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							67641f3b58
							
						
					 | 
					
						
						
							
							* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string
						
						
						
						
						
					 | 
					
						2015-07-13 21:46:02 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							6eef0bf9ab
							
						
					 | 
					
						
						
							
							* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx
						
						
						
						
						
					 | 
					
						2015-07-13 20:20:58 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							bb522496dd
							
						
					 | 
					
						
						
							
							* Rename Tokens to Doc
						
						
						
						
						
					 | 
					
						2015-07-08 18:53:00 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							935bcdf3e5
							
						
					 | 
					
						
						
							
							* Remove redundant tag_names argument to Tokenizer
						
						
						
						
						
					 | 
					
						2015-07-08 12:36:04 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							2d0e99a096
							
						
					 | 
					
						
						
							
							* Pass pos_tags into Tokenizer.from_dir
						
						
						
						
						
					 | 
					
						2015-07-07 14:23:08 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							6788c86b2f
							
						
					 | 
					
						
						
							
							* Begin refactor
						
						
						
						
						
					 | 
					
						2015-07-07 14:00:07 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							98cfd84123
							
						
					 | 
					
						
						
							
							* Remove hyphenation from main tokenizer loop: do it in infix.txt instead. This lets emoticons work
						
						
						
						
						
					 | 
					
						2015-06-06 05:57:03 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							20f1d868a3
							
						
					 | 
					
						
						
							
							* Tmp commit. Working on whole document parsing
						
						
						
						
						
					 | 
					
						2015-05-24 02:49:56 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Jordan Suchow
							
						 
					 | 
					
						
						
						
						
							
						
						
							3a8d9b37a6
							
						
					 | 
					
						
						
							
							Remove trailing whitespace
						
						
						
						
						
					 | 
					
						2015-04-19 13:01:38 -07:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							f02c39dfaf
							
						
					 | 
					
						
						
							
							* Compare to is not None, for more robustness
						
						
						
						
						
					 | 
					
						2015-03-26 16:44:48 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							7237c805c7
							
						
					 | 
					
						
						
							
							* Load tag for specials.json token
						
						
						
						
						
					 | 
					
						2015-03-26 16:44:46 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							0492cee8b4
							
						
					 | 
					
						
						
							
							* Fix Issue #24: Lemmas are empty when the L field is missing for special-cased tokens
						
						
						
						
						
					 | 
					
						2015-02-08 18:30:30 -05:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							4ff180db74
							
						
					 | 
					
						
						
							
							* Fix off-by-one error in commit 0a7fceb
						
						
						
						
						
					 | 
					
						2015-01-30 12:49:33 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							0a7fcebdf7
							
						
					 | 
					
						
						
							
							* Fix Issue #12: Incorrect token.idx calculations for some punctuation, in the presence of token cache
						
						
						
						
						
					 | 
					
						2015-01-30 12:33:38 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							5928d158ce
							
						
					 | 
					
						
						
							
							* Pass the string to Tokens
						
						
						
						
						
					 | 
					
						2015-01-22 02:04:58 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							6c7e44140b
							
						
					 | 
					
						
						
							
							* Work on word vectors, and other stuff
						
						
						
						
						
					 | 
					
						2015-01-17 16:21:17 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							ce2edd6312
							
						
					 | 
					
						
						
							
							* Tmp commit. Refactoring to create a Python Lexeme class.
						
						
						
						
						
					 | 
					
						2015-01-12 10:26:22 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							3f1944d688
							
						
					 | 
					
						
						
							
							* Make PyPy work
						
						
						
						
						
					 | 
					
						2015-01-05 17:54:38 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							9976aa976e
							
						
					 | 
					
						
						
							
							* Messily fix morphology and POS tags on special tokens.
						
						
						
						
						
					 | 
					
						2014-12-30 23:24:37 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							4c4aa2c5c9
							
						
					 | 
					
						
						
							
							* Work on train
						
						
						
						
						
					 | 
					
						2014-12-22 07:25:43 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							e1c1a4b868
							
						
					 | 
					
						
						
							
							* Tmp
						
						
						
						
						
					 | 
					
						2014-12-21 05:36:29 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							be1bdcbd85
							
						
					 | 
					
						
						
							
							* Move lang.pyx to tokenizer.pyx
						
						
						
						
						
					 | 
					
						2014-12-20 07:55:40 +11:00 | 
					
					
						
						
							
							
							
						
					 |