Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							f1c3e17c80
							
						
					 | 
					
						
						
							
							* Work on intro copy
						
						
						
						
						
					 | 
					
						2014-11-03 00:13:19 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							fa91506073
							
						
					 | 
					
						
						
							
							* Add '' double quote to suffixes file
						
						
						
						
						
					 | 
					
						2014-11-03 00:12:59 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							493d5ffb50
							
						
					 | 
					
						
						
							
							* Add test for '' in punct
						
						
						
						
						
					 | 
					
						2014-11-02 21:24:09 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							711ed0f636
							
						
					 | 
					
						
						
							
							* Whitespace
						
						
						
						
						
					 | 
					
						2014-11-02 14:22:32 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							fcd9490d56
							
						
					 | 
					
						
						
							
							* Add pos_tag method to Language
						
						
						
						
						
					 | 
					
						2014-11-02 14:21:43 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							99b5cefa88
							
						
					 | 
					
						
						
							
							* Add tests for emoticon tokenization
						
						
						
						
						
					 | 
					
						2014-11-02 13:22:14 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							23131f21bb
							
						
					 | 
					
						
						
							
							* Add tests for like_url
						
						
						
						
						
					 | 
					
						2014-11-02 13:21:57 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							dc6c3c0f56
							
						
					 | 
					
						
						
							
							* Add tests for like_number
						
						
						
						
						
					 | 
					
						2014-11-02 13:21:39 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							829bb2bdbe
							
						
					 | 
					
						
						
							
							* Add mappings to Twitter POS tag corpus
						
						
						
						
						
					 | 
					
						2014-11-02 13:21:19 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							437cd2217d
							
						
					 | 
					
						
						
							
							* Fix strings i/o, removing use of ujson library in favour of plain text file. Allows better control of codecs.
						
						
						
						
						
					 | 
					
						2014-11-02 13:20:37 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							3352e89e21
							
						
					 | 
					
						
						
							
							* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web
						
						
						
						
						
					 | 
					
						2014-11-02 13:19:54 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							8335706321
							
						
					 | 
					
						
						
							
							* Add LIKE_URL and LIKE_NUMBER flag features
						
						
						
						
						
					 | 
					
						2014-11-02 13:19:23 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							c414d0eebe
							
						
					 | 
					
						
						
							
							* Add tests for is_number
						
						
						
						
						
					 | 
					
						2014-11-01 19:13:40 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							5484fbea69
							
						
					 | 
					
						
						
							
							* Implement is_number
						
						
						
						
						
					 | 
					
						2014-11-01 19:13:24 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							f685218e21
							
						
					 | 
					
						
						
							
							* Add is_urlish function
						
						
						
						
						
					 | 
					
						2014-11-01 17:39:34 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							11e42fd070
							
						
					 | 
					
						
						
							
							* Add emoticons to tokenization
						
						
						
						
						
					 | 
					
						2014-11-01 15:14:55 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							39743323ea
							
						
					 | 
					
						
						
							
							* Add i'ma to tokenization rules
						
						
						
						
						
					 | 
					
						2014-10-31 17:45:44 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							09a3e54176
							
						
					 | 
					
						
						
							
							* Delete print statements from stringstore
						
						
						
						
						
					 | 
					
						2014-10-31 17:45:26 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							b186a66bae
							
						
					 | 
					
						
						
							
							* Rename Token.lex_pos to Token.postype, and Token.lex_supersense to Token.sensetype
						
						
						
						
						
					 | 
					
						2014-10-31 17:44:39 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							a8ca078b24
							
						
					 | 
					
						
						
							
							* Restore lexemes field to lexicon
						
						
						
						
						
					 | 
					
						2014-10-31 17:43:25 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							6c807aa45f
							
						
					 | 
					
						
						
							
							* Restore id attribute to lexeme, and rename pos field to postype, to store clustered tag dictionaries
						
						
						
						
						
					 | 
					
						2014-10-31 17:43:00 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							aaf6953fe0
							
						
					 | 
					
						
						
							
							* Add count_tags function to pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web.
						
						
						
						
						
					 | 
					
						2014-10-31 17:42:15 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							f67cb9a5a3
							
						
					 | 
					
						
						
							
							* Add count_tags function to pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web.
						
						
						
						
						
					 | 
					
						2014-10-31 17:42:04 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							63114820cf
							
						
					 | 
					
						
						
							
							* Upd tests for tighter interface
						
						
						
						
						
					 | 
					
						2014-10-30 18:15:30 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							ea8f1e7053
							
						
					 | 
					
						
						
							
							* Tighten interfaces
						
						
						
						
						
					 | 
					
						2014-10-30 18:14:42 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							ea85bf3a0a
							
						
					 | 
					
						
						
							
							* Tighten the interface to Language
						
						
						
						
						
					 | 
					
						2014-10-30 18:01:27 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							c6fcd03692
							
						
					 | 
					
						
						
							
							* Small efficiency tweak to lexeme init
						
						
						
						
						
					 | 
					
						2014-10-30 17:56:11 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							87c2418a89
							
						
					 | 
					
						
						
							
							* Fiddle with data types on Lexeme, to compress them to a much smaller size.
						
						
						
						
						
					 | 
					
						2014-10-30 15:42:15 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							ac88893232
							
						
					 | 
					
						
						
							
							* Fix Token after lexeme changes
						
						
						
						
						
					 | 
					
						2014-10-30 15:30:52 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							e6b87766fe
							
						
					 | 
					
						
						
							
							* Remove lexemes vector from Lexicon, and the id and hash attributes from Lexeme
						
						
						
						
						
					 | 
					
						2014-10-30 15:21:38 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							889b7b48b4
							
						
					 | 
					
						
						
							
							* Fix POS tagger, so that it loads correctly. Lexemes are being read in.
						
						
						
						
						
					 | 
					
						2014-10-30 13:38:55 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							67c8c8019f
							
						
					 | 
					
						
						
							
							* Update lexeme serialization, using a binary file format
						
						
						
						
						
					 | 
					
						2014-10-30 01:01:00 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							13909a2e24
							
						
					 | 
					
						
						
							
							* Rewriting Lexeme serialization.
						
						
						
						
						
					 | 
					
						2014-10-29 23:19:38 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							234d49bf4d
							
						
					 | 
					
						
						
							
							* Seems to be working after refactor. Need to wire up more POS tag features, and wire up save/load of POS tags.
						
						
						
						
						
					 | 
					
						2014-10-24 02:23:42 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							08ce602243
							
						
					 | 
					
						
						
							
							* Large refactor, particularly to Python API
						
						
						
						
						
					 | 
					
						2014-10-24 00:59:17 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							168b2b8cb2
							
						
					 | 
					
						
						
							
							* Add tests for string intern
						
						
						
						
						
					 | 
					
						2014-10-23 20:47:06 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							7baef5b7ff
							
						
					 | 
					
						
						
							
							* Fix padding on tokens
						
						
						
						
						
					 | 
					
						2014-10-23 04:01:17 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							96b835a3d4
							
						
					 | 
					
						
						
							
							* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags.
						
						
						
						
						
					 | 
					
						2014-10-23 03:20:02 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							e5e951ae67
							
						
					 | 
					
						
						
							
							* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding.
						
						
						
						
						
					 | 
					
						2014-10-23 01:57:59 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							ea1d4a81eb
							
						
					 | 
					
						
						
							
							* Refactoring get_atoms, improving tokens API
						
						
						
						
						
					 | 
					
						2014-10-22 13:10:56 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							ad49e2482e
							
						
					 | 
					
						
						
							
							* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text.
						
						
						
						
						
					 | 
					
						2014-10-22 12:57:06 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							0a0e41f6c8
							
						
					 | 
					
						
						
							
							* Add prefix and suffix features
						
						
						
						
						
					 | 
					
						2014-10-22 12:56:09 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							7018b53d3a
							
						
					 | 
					
						
						
							
							* Improve array features in tokens
						
						
						
						
						
					 | 
					
						2014-10-22 12:55:42 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							43d5964e13
							
						
					 | 
					
						
						
							
							* Add function to read detokenization rules
						
						
						
						
						
					 | 
					
						2014-10-22 12:54:59 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							077885637d
							
						
					 | 
					
						
						
							
							* Add test for reading in POS tags
						
						
						
						
						
					 | 
					
						2014-10-22 10:18:43 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							224bdae996
							
						
					 | 
					
						
						
							
							* Add POS utilities
						
						
						
						
						
					 | 
					
						2014-10-22 10:17:57 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							5ebe14f353
							
						
					 | 
					
						
						
							
							* Add greedy pos tagger
						
						
						
						
						
					 | 
					
						2014-10-22 10:17:26 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							12742f4f83
							
						
					 | 
					
						
						
							
							* Add detokenize method and test
						
						
						
						
						
					 | 
					
						2014-10-18 18:07:29 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							df110476d5
							
						
					 | 
					
						
						
							
							* Update docs
						
						
						
						
						
					 | 
					
						2014-10-15 21:50:34 +11:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Matthew Honnibal
							
						 
					 | 
					
						
						
						
						
							
						
						
							849de654e7
							
						
					 | 
					
						
						
							
							* Add file for infix patterns
						
						
						
						
						
					 | 
					
						2014-10-14 20:26:43 +11:00 | 
					
					
						
						
							
							
							
						
					 |