mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			64 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			64 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| def get_pos_from_wiktionary():
 | |
|     import re
 | |
|     from gensim.corpora.wikicorpus import extract_pages
 | |
| 
 | |
|     regex = re.compile(r"==={{(\w+)\|el}}===")
 | |
|     regex2 = re.compile(r"==={{(\w+ \w+)\|el}}===")
 | |
| 
 | |
|     # get words based on the Wiktionary dump
 | |
|     # check only for specific parts
 | |
| 
 | |
|     # ==={{κύριο όνομα|el}}===
 | |
|     expected_parts = [
 | |
|         "μετοχή",
 | |
|         "ρήμα",
 | |
|         "επίθετο",
 | |
|         "επίρρημα",
 | |
|         "ουσιαστικό",
 | |
|         "κύριο όνομα",
 | |
|         "άρθρο",
 | |
|     ]
 | |
| 
 | |
|     wiktionary_file_path = (
 | |
|         "/data/gsoc2018-spacy/spacy/lang/el/res/elwiktionary-latest-pages-articles.xml"
 | |
|     )
 | |
| 
 | |
|     proper_names_dict = {
 | |
|         "ουσιαστικό": "nouns",
 | |
|         "επίθετο": "adjectives",
 | |
|         "άρθρο": "dets",
 | |
|         "επίρρημα": "adverbs",
 | |
|         "κύριο όνομα": "proper_names",
 | |
|         "μετοχή": "participles",
 | |
|         "ρήμα": "verbs",
 | |
|     }
 | |
|     expected_parts_dict = {}
 | |
|     for expected_part in expected_parts:
 | |
|         expected_parts_dict[expected_part] = []
 | |
| 
 | |
|     for title, text, pageid in extract_pages(wiktionary_file_path):
 | |
|         if text.startswith("#REDIRECT"):
 | |
|             continue
 | |
|         title = title.lower()
 | |
|         all_regex = regex.findall(text)
 | |
|         all_regex.extend(regex2.findall(text))
 | |
|         for a in all_regex:
 | |
|             if a in expected_parts:
 | |
|                 expected_parts_dict[a].append(title)
 | |
| 
 | |
|     for i in expected_parts_dict:
 | |
|         with open("_{0}.py".format(proper_names_dict[i]), "w") as f:
 | |
|             f.write("from __future__ import unicode_literals\n")
 | |
|             f.write('{} = set("""\n'.format(proper_names_dict[i].upper()))
 | |
|             words = sorted(expected_parts_dict[i])
 | |
|             line = ""
 | |
|             to_write = []
 | |
|             for word in words:
 | |
|                 if len(line + " " + word) > 79:
 | |
|                     to_write.append(line)
 | |
|                     line = ""
 | |
|                 else:
 | |
|                     line = line + " " + word
 | |
|             f.write("\n".join(to_write))
 | |
|             f.write('\n""".split())')
 |