svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							b58bace84b
							
						
					 | 
					
						
						
							
							small fixes
						
						
						
						
						
					 | 
					
						2019-06-24 10:55:04 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							a31648d28b
							
						
					 | 
					
						
						
							
							further code cleanup
						
						
						
						
						
					 | 
					
						2019-06-19 09:15:43 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							478305cd3f
							
						
					 | 
					
						
						
							
							small tweaks and documentation
						
						
						
						
						
					 | 
					
						2019-06-18 18:38:09 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							0d177c1146
							
						
					 | 
					
						
						
							
							clean up code, remove old code, move to bin
						
						
						
						
						
					 | 
					
						2019-06-18 13:20:40 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							6961215578
							
						
					 | 
					
						
						
							
							refactor code to separate functionality into different files
						
						
						
						
						
					 | 
					
						2019-05-06 10:56:56 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							f5190267e7
							
						
					 | 
					
						
						
							
							run only 100M of WP data as training dataset (9%)
						
						
						
						
						
					 | 
					
						2019-05-03 18:09:09 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							4e929600e5
							
						
					 | 
					
						
						
							
							fix WP id parsing, speed up processing and remove ambiguous strings in one doc (for now)
						
						
						
						
						
					 | 
					
						2019-05-03 17:37:47 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							34600c92bd
							
						
					 | 
					
						
						
							
							try catch per article to ensure the pipeline goes on
						
						
						
						
						
					 | 
					
						2019-05-03 15:10:09 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							bbcb9da466
							
						
					 | 
					
						
						
							
							creating training data with clean WP texts and QID entities true/false
						
						
						
						
						
					 | 
					
						2019-05-03 10:44:29 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							cba9680d13
							
						
					 | 
					
						
						
							
							run NER on clean WP text and link to gold-standard entity IDs
						
						
						
						
						
					 | 
					
						2019-05-02 17:24:52 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							581dc9742d
							
						
					 | 
					
						
						
							
							parsing clean text from WP articles to use as input data for NER and NEL
						
						
						
						
						
					 | 
					
						2019-05-02 17:09:56 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							8353552191
							
						
					 | 
					
						
						
							
							cleanup
						
						
						
						
						
					 | 
					
						2019-05-01 23:26:16 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							1ae41daaa9
							
						
					 | 
					
						
						
							
							allow small rounding errors
						
						
						
						
						
					 | 
					
						2019-05-01 23:05:40 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							3629a52ede
							
						
					 | 
					
						
						
							
							reading all persons in wikidata
						
						
						
						
						
					 | 
					
						2019-05-01 01:00:59 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							60b54ae8ce
							
						
					 | 
					
						
						
							
							bulk entity writing and experiment with regex wikidata reader to speed up processing
						
						
						
						
						
					 | 
					
						2019-05-01 00:00:38 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							653b7d9c87
							
						
					 | 
					
						
						
							
							calculate entity raw counts offline to speed up KB construction
						
						
						
						
						
					 | 
					
						2019-04-30 11:39:42 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							19e8f339cb
							
						
					 | 
					
						
						
							
							deduce entity freq from WP corpus and serialize vocab in WP test
						
						
						
						
						
					 | 
					
						2019-04-29 17:37:29 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							54d0cea062
							
						
					 | 
					
						
						
							
							unit test for KB serialization
						
						
						
						
						
					 | 
					
						2019-04-24 23:52:34 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							3e0cb69065
							
						
					 | 
					
						
						
							
							KB aliases to and from file
						
						
						
						
						
					 | 
					
						2019-04-24 20:24:24 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							ad6c5e581c
							
						
					 | 
					
						
						
							
							writing and reading number of entries to/from header
						
						
						
						
						
					 | 
					
						2019-04-24 15:31:44 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							6e3223f234
							
						
					 | 
					
						
						
							
							bulk loading in proper order of entity indices
						
						
						
						
						
					 | 
					
						2019-04-24 11:26:38 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							694fea597a
							
						
					 | 
					
						
						
							
							dumping all entryC entries + (inefficient) reading back in
						
						
						
						
						
					 | 
					
						2019-04-23 18:36:50 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							8e70a564f1
							
						
					 | 
					
						
						
							
							custom reader and writer for _EntryC fields (first stab at it - not complete)
						
						
						
						
						
					 | 
					
						2019-04-23 16:33:40 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							004e5e7d1c
							
						
					 | 
					
						
						
							
							little fixes
						
						
						
						
						
					 | 
					
						2019-04-19 14:24:02 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							9a8197185b
							
						
					 | 
					
						
						
							
							fix alias capitalization
						
						
						
						
						
					 | 
					
						2019-04-18 22:37:50 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							9f308eb5dc
							
						
					 | 
					
						
						
							
							fixes for prior prob and linking wikidata IDs with wikipedia titles
						
						
						
						
						
					 | 
					
						2019-04-18 16:14:25 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							10ee8dfea2
							
						
					 | 
					
						
						
							
							poc with few entities and collecting aliases from the WP links
						
						
						
						
						
					 | 
					
						2019-04-18 14:12:17 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							6763e025e1
							
						
					 | 
					
						
						
							
							parse wp dump for links to determine prior probabilities
						
						
						
						
						
					 | 
					
						2019-04-15 11:41:57 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							3163331b1e
							
						
					 | 
					
						
						
							
							wikipedia dump parser and mediawiki format regex cleanup
						
						
						
						
						
					 | 
					
						2019-04-14 21:52:01 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							b31a390a9a
							
						
					 | 
					
						
						
							
							reading types, claims and sitelinks
						
						
						
						
						
					 | 
					
						2019-04-11 21:42:44 +02:00 | 
					
					
						
						
							
							
							
						
					 | 
				
			
				
					
						
							
							
								 
								svlandeg
							
						 
					 | 
					
						
						
						
						
							
						
						
							6e997be4b4
							
						
					 | 
					
						
						
							
							reading wikidata descriptions and aliases
						
						
						
						
						
					 | 
					
						2019-04-11 21:08:22 +02:00 | 
					
					
						
						
							
							
							
						
					 |