Ben Eyal
							
						 
					 | 
					
						
						
						
						
							
						
						
							33af52599e
							
						
					 | 
					
						
						
							
							Redefine alphabetic characters
						
						
						
						
						
						
						
						For caseless languages (Hebrew, Bengali) all characters are both lowercase and uppercase. 
						
					 | 
					
						2017-04-20 02:25:02 +03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ben Eyal
							
						 
					 | 
					
						
						
						
						
							
						
						
							d8098a8be2
							
						
					 | 
					
						
						
							
							Use regex instead of re
						
						
						
						
						
					 | 
					
						2017-04-20 02:22:52 +03:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								ines
							
						 
					 | 
					
						
						
						
						
							
						
						
							bf0f15e762
							
						
					 | 
					
						
						
							
							Add / to tokenizer infixes (resolves #891)
						
						
						
						
						
					 | 
					
						2017-04-07 17:30:44 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								ines
							
						 
					 | 
					
						
						
						
						
							
						
						
							66c1f194f9
							
						
					 | 
					
						
						
							
							Use consistent unicode declarations
						
						
						
						
						
					 | 
					
						2017-03-12 13:07:28 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ines Montani
							
						 
					 | 
					
						
						
						
						
							
						
						
							012f4820cb
							
						
					 | 
					
						
						
							
							Keep infixes of punctuation + hyphens as one token (see #801)
						
						
						
						
						
					 | 
					
						2017-02-02 16:22:40 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ines Montani
							
						 
					 | 
					
						
						
						
						
							
						
						
							1219a5f513
							
						
					 | 
					
						
						
							
							Add = to tokenizer prefixes
						
						
						
						
						
					 | 
					
						2017-02-02 16:21:11 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ines Montani
							
						 
					 | 
					
						
						
						
						
							
						
						
							116c675c3c
							
						
					 | 
					
						
						
							
							Merge pull request #742 from oroszgy/hu_tokenizer_fix
						
						
						
						
						
						
						
						Improved Hungarian tokenizer 
						
					 | 
					
						2017-01-14 23:52:44 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gyorgy Orosz
							
						 
					 | 
					
						
						
						
						
							
						
						
							63037e79af
							
						
					 | 
					
						
						
							
							Fixed hyphen handling in the Hungarian tokenizer.
						
						
						
						
						
					 | 
					
						2017-01-14 16:30:11 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Gyorgy Orosz
							
						 
					 | 
					
						
						
						
						
							
						
						
							1be5da1ac6
							
						
					 | 
					
						
						
							
							Fixed Hungarian tokenizer for numbers
						
						
						
						
						
					 | 
					
						2017-01-14 15:51:59 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ines Montani
							
						 
					 | 
					
						
						
						
						
							
						
						
							0894b8c0ef
							
						
					 | 
					
						
						
							
							Don't split tokens with digits and "/" infixes (resolves #740)
						
						
						
						
						
					 | 
					
						2017-01-12 22:58:26 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ines Montani
							
						 
					 | 
					
						
						
						
						
							
						
						
							eef94e3ee2
							
						
					 | 
					
						
						
							
							Split off period after two or more uppercase letters (fixes #483)
						
						
						
						
						
					 | 
					
						2017-01-08 22:28:25 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ines Montani
							
						 
					 | 
					
						
						
						
						
							
						
						
							347c4a2d06
							
						
					 | 
					
						
						
							
							Reorganise and reformat global tokenizer prefixes, suffixes and infixes
						
						
						
						
						
					 | 
					
						2017-01-08 20:37:39 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ines Montani
							
						 
					 | 
					
						
						
						
						
							
						
						
							eaa3b1319d
							
						
					 | 
					
						
						
							
							Fix formatting
						
						
						
						
						
					 | 
					
						2016-12-18 15:36:53 +01:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								Ines Montani
							
						 
					 | 
					
						
						
						
						
							
						
						
							e47ee94761
							
						
					 | 
					
						
						
							
							Split punctuation into its own file
						
						
						
						
						
					 | 
					
						2016-12-08 19:46:43 +01:00 | 
					
					
						
						
							
							
							
						
					 |