svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							68a0662019 
							
						 
					 
					
						
						
							
							context encoder with Tok2Vec + linking model instead of cosine  
						
						
						
					 
					
						2019-06-28 08:29:31 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							dbc53b9870 
							
						 
					 
					
						
						
							
							rename to KBEntryC  
						
						
						
					 
					
						2019-06-26 15:55:26 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							1de61f68d6 
							
						 
					 
					
						
						
							
							improve speed of prediction loop  
						
						
						
					 
					
						2019-06-26 13:53:10 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							bee23cd8af 
							
						 
					 
					
						
						
							
							try Tok2Vec instead of SpacyVectors  
						
						
						
					 
					
						2019-06-25 16:09:22 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							b58bace84b 
							
						 
					 
					
						
						
							
							small fixes  
						
						
						
					 
					
						2019-06-24 10:55:04 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							a31648d28b 
							
						 
					 
					
						
						
							
							further code cleanup  
						
						
						
					 
					
						2019-06-19 09:15:43 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							478305cd3f 
							
						 
					 
					
						
						
							
							small tweaks and documentation  
						
						
						
					 
					
						2019-06-18 18:38:09 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							0d177c1146 
							
						 
					 
					
						
						
							
							clean up code, remove old code, move to bin  
						
						
						
					 
					
						2019-06-18 13:20:40 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							ffae7d3555 
							
						 
					 
					
						
						
							
							sentence encoder only (removing article/mention encoder)  
						
						
						
					 
					
						2019-06-18 00:05:47 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							6332af40de 
							
						 
					 
					
						
						
							
							baseline performances: oracle KB, random and prior prob  
						
						
						
					 
					
						2019-06-17 14:39:40 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							24db1392b9 
							
						 
					 
					
						
						
							
							reprocessing all of wikipedia for training data  
						
						
						
					 
					
						2019-06-16 21:14:45 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							81731907ba 
							
						 
					 
					
						
						
							
							performance per entity type  
						
						
						
					 
					
						2019-06-14 19:55:46 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							b312f2d0e7 
							
						 
					 
					
						
						
							
							redo training data to be independent of KB and entity-level instead of doc-level  
						
						
						
					 
					
						2019-06-14 15:55:26 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							0b04d142de 
							
						 
					 
					
						
						
							
							regenerating KB  
						
						
						
					 
					
						2019-06-13 22:32:56 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							78dd3e11da 
							
						 
					 
					
						
						
							
							write entity linking pipe to file and keep vocab consistent between kb and nlp  
						
						
						
					 
					
						2019-06-13 16:25:39 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							b12001f368 
							
						 
					 
					
						
						
							
							small fixes  
						
						
						
					 
					
						2019-06-12 22:05:53 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							6521cfa132 
							
						 
					 
					
						
						
							
							speeding up training  
						
						
						
					 
					
						2019-06-12 13:37:05 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							66813a1fdc 
							
						 
					 
					
						
						
							
							speed up predictions  
						
						
						
					 
					
						2019-06-11 14:18:20 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							fe1ed432ef 
							
						 
					 
					
						
						
							
							eval on dev set, varying combo's of prior and context scores  
						
						
						
					 
					
						2019-06-11 11:40:58 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							83dc7b46fd 
							
						 
					 
					
						
						
							
							first tests with EL pipe  
						
						
						
					 
					
						2019-06-10 21:25:26 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							7de1ee69b8 
							
						 
					 
					
						
						
							
							training loop in proper pipe format  
						
						
						
					 
					
						2019-06-07 15:55:10 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							0486ccabfd 
							
						 
					 
					
						
						
							
							introduce goldparse.links  
						
						
						
					 
					
						2019-06-07 13:54:45 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							a5c061f506 
							
						 
					 
					
						
						
							
							storing NEL training data in GoldParse objects  
						
						
						
					 
					
						2019-06-07 12:58:42 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							61f0e2af65 
							
						 
					 
					
						
						
							
							code cleanup  
						
						
						
					 
					
						2019-06-06 20:22:14 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							d8b435ceff 
							
						 
					 
					
						
						
							
							pretraining description vectors and storing them in the KB  
						
						
						
					 
					
						2019-06-06 19:51:27 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							5c723c32c3 
							
						 
					 
					
						
						
							
							entity vectors in the KB + serialization of them  
						
						
						
					 
					
						2019-06-05 18:29:18 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9abbd0899f 
							
						 
					 
					
						
						
							
							separate entity encoder to get 64D descriptions  
						
						
						
					 
					
						2019-06-05 00:09:46 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							fb37cdb2d3 
							
						 
					 
					
						
						
							
							implementing el pipe in pipes.pyx (not tested yet)  
						
						
						
					 
					
						2019-06-03 21:32:54 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							d83a1e3052 
							
						 
					 
					
						
						
							
							Merge branch 'master' into feature/nel-wiki  
						
						
						
					 
					
						2019-06-03 09:35:10 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9e88763dab 
							
						 
					 
					
						
						
							
							60% acc run  
						
						
						
					 
					
						2019-06-03 08:04:49 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							268a52ead7 
							
						 
					 
					
						
						
							
							experimenting with cosine sim for negative examples (not OK yet)  
						
						
						
					 
					
						2019-05-29 16:07:53 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							a761929fa5 
							
						 
					 
					
						
						
							
							context encoder combining sentence and article  
						
						
						
					 
					
						2019-05-28 18:14:49 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							992fa92b66 
							
						 
					 
					
						
						
							
							refactor again to clusters of entities and cosine similarity  
						
						
						
					 
					
						2019-05-28 00:05:22 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							8c4aa076bc 
							
						 
					 
					
						
						
							
							small fixes  
						
						
						
					 
					
						2019-05-27 14:29:38 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							cfc27d7ff9 
							
						 
					 
					
						
						
							
							using Tok2Vec instead  
						
						
						
					 
					
						2019-05-26 23:39:46 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							abf9af81c9 
							
						 
					 
					
						
						
							
							learn rate en epochs  
						
						
						
					 
					
						2019-05-24 22:04:25 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							86ed771e0b 
							
						 
					 
					
						
						
							
							adding local sentence encoder  
						
						
						
					 
					
						2019-05-23 16:59:11 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							4392c01b7b 
							
						 
					 
					
						
						
							
							obtain sentence for each mention  
						
						
						
					 
					
						2019-05-23 15:37:05 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							97241a3ed7 
							
						 
					 
					
						
						
							
							upsampling and batch processing  
						
						
						
					 
					
						2019-05-22 23:40:10 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							1a16490d20 
							
						 
					 
					
						
						
							
							update per entity  
						
						
						
					 
					
						2019-05-22 12:46:40 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							eb08bdb11f 
							
						 
					 
					
						
						
							
							hidden with for encoders  
						
						
						
					 
					
						2019-05-21 23:42:46 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							7b13e3d56f 
							
						 
					 
					
						
						
							
							undersampling negatives  
						
						
						
					 
					
						2019-05-21 18:35:10 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							2fa3fac851 
							
						 
					 
					
						
						
							
							fix concat bp and more efficient batch calls  
						
						
						
					 
					
						2019-05-21 13:43:59 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							0a15ee4541 
							
						 
					 
					
						
						
							
							fix in bp call  
						
						
						
					 
					
						2019-05-20 23:54:55 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							89e322a637 
							
						 
					 
					
						
						
							
							small fixes  
						
						
						
					 
					
						2019-05-20 17:20:39 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							7edb2e1711 
							
						 
					 
					
						
						
							
							fix convolution layer  
						
						
						
					 
					
						2019-05-20 11:58:48 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							dd691d0053 
							
						 
					 
					
						
						
							
							debugging  
						
						
						
					 
					
						2019-05-17 17:44:11 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							400b19353d 
							
						 
					 
					
						
						
							
							simplify architecture and larger-scale test runs  
						
						
						
					 
					
						2019-05-17 01:51:18 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							d51bffe63b 
							
						 
					 
					
						
						
							
							clean up code  
						
						
						
					 
					
						2019-05-16 18:36:15 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							b5470f3d75 
							
						 
					 
					
						
						
							
							various tests, architectures and experiments  
						
						
						
					 
					
						2019-05-16 18:25:34 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9ffe5437ae 
							
						 
					 
					
						
						
							
							calculate gradient for entity encoding  
						
						
						
					 
					
						2019-05-15 02:23:08 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							2713abc651 
							
						 
					 
					
						
						
							
							implement loss function using dot product and prob estimate per candidate cluster  
						
						
						
					 
					
						2019-05-14 22:55:56 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							09ed446b20 
							
						 
					 
					
						
						
							
							different architecture / settings  
						
						
						
					 
					
						2019-05-14 08:37:52 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							4142e8dd1b 
							
						 
					 
					
						
						
							
							train and predict per article (saving time for doc encoding)  
						
						
						
					 
					
						2019-05-13 17:02:34 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							3b81b00954 
							
						 
					 
					
						
						
							
							evaluating on dev set during training  
						
						
						
					 
					
						2019-05-13 14:26:04 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							b6d788064a 
							
						 
					 
					
						
						
							
							some first experiments with different architectures and metrics  
						
						
						
					 
					
						2019-05-10 12:53:14 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9d089c0410 
							
						 
					 
					
						
						
							
							grouping clusters of instances per doc+mention  
						
						
						
					 
					
						2019-05-09 18:11:49 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							c6ca8649d7 
							
						 
					 
					
						
						
							
							first stab at model - not functional yet  
						
						
						
					 
					
						2019-05-09 17:23:19 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9f33732b96 
							
						 
					 
					
						
						
							
							using entity descriptions and article texts as input embedding vectors for training  
						
						
						
					 
					
						2019-05-07 16:03:42 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							7e348d7f7f 
							
						 
					 
					
						
						
							
							baseline evaluation using highest-freq candidate  
						
						
						
					 
					
						2019-05-06 15:13:50 +02:00 
						 
				 
			
				
					
						
							
							
								Ines Montani 
							
						 
					 
					
						
						
						
						
							
						
						
							dd153b2b33 
							
						 
					 
					
						
						
							
							Simplify helper (see  #3681 ) [ci skip]  
						
						
						
					 
					
						2019-05-06 15:13:10 +02:00 
						 
				 
			
				
					
						
							
							
								Ines Montani 
							
						 
					 
					
						
						
						
						
							
						
						
							f8fce6c03c 
							
						 
					 
					
						
						
							
							Fix typo (see  #3681 )  
						
						
						
					 
					
						2019-05-06 15:02:11 +02:00 
						 
				 
			
				
					
						
							
							
								Ines Montani 
							
						 
					 
					
						
						
						
						
							
						
						
							f2a56c1b56 
							
						 
					 
					
						
						
							
							Rewrite example to use Retokenizer ( resolves   #3681 )  
						
						... 
						
						
						
						Also add helper to filter spans 
						
					 
					
						2019-05-06 14:51:18 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							6961215578 
							
						 
					 
					
						
						
							
							refactor code to separate functionality into different files  
						
						
						
					 
					
						2019-05-06 10:56:56 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							f5190267e7 
							
						 
					 
					
						
						
							
							run only 100M of WP data as training dataset (9%)  
						
						
						
					 
					
						2019-05-03 18:09:09 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							4e929600e5 
							
						 
					 
					
						
						
							
							fix WP id parsing, speed up processing and remove ambiguous strings in one doc (for now)  
						
						
						
					 
					
						2019-05-03 17:37:47 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							34600c92bd 
							
						 
					 
					
						
						
							
							try catch per article to ensure the pipeline goes on  
						
						
						
					 
					
						2019-05-03 15:10:09 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							bbcb9da466 
							
						 
					 
					
						
						
							
							creating training data with clean WP texts and QID entities true/false  
						
						
						
					 
					
						2019-05-03 10:44:29 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							cba9680d13 
							
						 
					 
					
						
						
							
							run NER on clean WP text and link to gold-standard entity IDs  
						
						
						
					 
					
						2019-05-02 17:24:52 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							581dc9742d 
							
						 
					 
					
						
						
							
							parsing clean text from WP articles to use as input data for NER and NEL  
						
						
						
					 
					
						2019-05-02 17:09:56 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							8353552191 
							
						 
					 
					
						
						
							
							cleanup  
						
						
						
					 
					
						2019-05-01 23:26:16 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							1ae41daaa9 
							
						 
					 
					
						
						
							
							allow small rounding errors  
						
						
						
					 
					
						2019-05-01 23:05:40 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							3629a52ede 
							
						 
					 
					
						
						
							
							reading all persons in wikidata  
						
						
						
					 
					
						2019-05-01 01:00:59 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							60b54ae8ce 
							
						 
					 
					
						
						
							
							bulk entity writing and experiment with regex wikidata reader to speed up processing  
						
						
						
					 
					
						2019-05-01 00:00:38 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							653b7d9c87 
							
						 
					 
					
						
						
							
							calculate entity raw counts offline to speed up KB construction  
						
						
						
					 
					
						2019-04-30 11:39:42 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							19e8f339cb 
							
						 
					 
					
						
						
							
							deduce entity freq from WP corpus and serialize vocab in WP test  
						
						
						
					 
					
						2019-04-29 17:37:29 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							54d0cea062 
							
						 
					 
					
						
						
							
							unit test for KB serialization  
						
						
						
					 
					
						2019-04-24 23:52:34 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							3e0cb69065 
							
						 
					 
					
						
						
							
							KB aliases to and from file  
						
						
						
					 
					
						2019-04-24 20:24:24 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							ad6c5e581c 
							
						 
					 
					
						
						
							
							writing and reading number of entries to/from header  
						
						
						
					 
					
						2019-04-24 15:31:44 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							6e3223f234 
							
						 
					 
					
						
						
							
							bulk loading in proper order of entity indices  
						
						
						
					 
					
						2019-04-24 11:26:38 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							694fea597a 
							
						 
					 
					
						
						
							
							dumping all entryC entries + (inefficient) reading back in  
						
						
						
					 
					
						2019-04-23 18:36:50 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							8e70a564f1 
							
						 
					 
					
						
						
							
							custom reader and writer for _EntryC fields (first stab at it - not complete)  
						
						
						
					 
					
						2019-04-23 16:33:40 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							004e5e7d1c 
							
						 
					 
					
						
						
							
							little fixes  
						
						
						
					 
					
						2019-04-19 14:24:02 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9a8197185b 
							
						 
					 
					
						
						
							
							fix alias capitalization  
						
						
						
					 
					
						2019-04-18 22:37:50 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9f308eb5dc 
							
						 
					 
					
						
						
							
							fixes for prior prob and linking wikidata IDs with wikipedia titles  
						
						
						
					 
					
						2019-04-18 16:14:25 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							10ee8dfea2 
							
						 
					 
					
						
						
							
							poc with few entities and collecting aliases from the WP links  
						
						
						
					 
					
						2019-04-18 14:12:17 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							6763e025e1 
							
						 
					 
					
						
						
							
							parse wp dump for links to determine prior probabilities  
						
						
						
					 
					
						2019-04-15 11:41:57 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							3163331b1e 
							
						 
					 
					
						
						
							
							wikipedia dump parser and mediawiki format regex cleanup  
						
						
						
					 
					
						2019-04-14 21:52:01 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							b31a390a9a 
							
						 
					 
					
						
						
							
							reading types, claims and sitelinks  
						
						
						
					 
					
						2019-04-11 21:42:44 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							6e997be4b4 
							
						 
					 
					
						
						
							
							reading wikidata descriptions and aliases  
						
						
						
					 
					
						2019-04-11 21:08:22 +02:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9a7d534b1b 
							
						 
					 
					
						
						
							
							enable nogil for cython functions in kb.pxd  
						
						
						
					 
					
						2019-04-10 17:25:10 +02:00 
						 
				 
			
				
					
						
							
							
								Ines Montani 
							
						 
					 
					
						
						
						
						
							
						
						
							24cecdb44f 
							
						 
					 
					
						
						
							
							Update compatibility [ci skip]  
						
						
						
					 
					
						2019-04-01 16:25:16 +02:00 
						 
				 
			
				
					
						
							
							
								Sofie 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							a4a6bfa4e1 
							
						 
					 
					
						
						
							
							Merge branch 'master' into feature/el-framework  
						
						
						
					 
					
						2019-03-26 11:00:02 +01:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							8814b9010d 
							
						 
					 
					
						
						
							
							entity as one field instead of both ID and name  
						
						
						
					 
					
						2019-03-25 18:10:41 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
						
						
							
						
						
							6c783f8045 
							
						 
					 
					
						
						
							
							Bug fixes and options for TextCategorizer ( #3472 )  
						
						... 
						
						
						
						* Fix code for bag-of-words feature extraction
The _ml.py module had a redundant copy of a function to extract unigram
bag-of-words features, except one had a bug that set values to 0.
Another function allowed extraction of bigram features. Replace all three
with a new function that supports arbitrary ngram sizes and also allows
control of which attribute is used (e.g. ORTH, LOWER, etc).
* Support 'bow' architecture for TextCategorizer
This allows efficient ngram bag-of-words models, which are better when
the classifier needs to run quickly, especially when the texts are long.
Pass architecture="bow" to use it. The extra arguments ngram_size and
attr are also available, e.g. ngram_size=2 means unigram and bigram
features will be extracted.
* Fix size limits in train_textcat example
* Explain architectures better in docs 
						
					 
					
						2019-03-23 16:44:44 +01:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							9de9900510 
							
						 
					 
					
						
						
							
							adding future import unicode literals to .py files  
						
						
						
					 
					
						2019-03-22 16:18:04 +01:00 
						 
				 
			
				
					
						
							
							
								Matthew Honnibal 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							4c5f265884 
							
						 
					 
					
						
						
							
							Fix train loop for train_textcat example  
						
						
						
					 
					
						2019-03-22 16:10:11 +01:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							5318ce88fa 
							
						 
					 
					
						
						
							
							'entity_linker' instead of 'el'  
						
						
						
					 
					
						2019-03-22 13:55:10 +01:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							a48241e9a2 
							
						 
					 
					
						
						
							
							use nlp's vocab for stringstore  
						
						
						
					 
					
						2019-03-22 11:36:45 +01:00 
						 
				 
			
				
					
						
							
							
								svlandeg 
							
						 
					 
					
						
						
						
						
							
						
						
							1ee0e78fd7 
							
						 
					 
					
						
						
							
							select candidate with highest prior probabiity  
						
						
						
					 
					
						2019-03-22 11:36:45 +01:00