spaCy/spacy/lang/el/lemmatizer/get_pos_from_wiktionary.py

# coding: utf8
from __future__ import unicode_literals


def get_pos_from_wiktionary():
    import re
    from gensim.corpora.wikicorpus import extract_pages

    regex = re.compile(r"==={{(\w+)\|el}}===")
    regex2 = re.compile(r"==={{(\w+ \w+)\|el}}===")

    # get words based on the Wiktionary dump
    # check only for specific parts

    # ==={{κύριο όνομα|el}}===
    expected_parts = [
        "μετοχή",
        "ρήμα",
        "επίθετο",
        "επίρρημα",
        "ουσιαστικό",
        "κύριο όνομα",
        "άρθρο",
    ]

    wiktionary_file_path = (
        "/data/gsoc2018-spacy/spacy/lang/el/res/elwiktionary-latest-pages-articles.xml"
    )

    proper_names_dict = {
        "ουσιαστικό": "nouns",
        "επίθετο": "adjectives",
        "άρθρο": "dets",
        "επίρρημα": "adverbs",
        "κύριο όνομα": "proper_names",
        "μετοχή": "participles",
        "ρήμα": "verbs",
    }
    expected_parts_dict = {}
    for expected_part in expected_parts:
        expected_parts_dict[expected_part] = []

    for title, text, pageid in extract_pages(wiktionary_file_path):
        if text.startswith("#REDIRECT"):
            continue
        title = title.lower()
        all_regex = regex.findall(text)
        all_regex.extend(regex2.findall(text))
        for a in all_regex:
            if a in expected_parts:
                expected_parts_dict[a].append(title)

    for i in expected_parts_dict:
        with open("_{0}.py".format(proper_names_dict[i]), "w") as f:
            f.write("from __future__ import unicode_literals\n")
            f.write('{} = set("""\n'.format(proper_names_dict[i].upper()))
            words = sorted(expected_parts_dict[i])
            line = ""
            to_write = []
            for word in words:
                if len(line + " " + word) > 79:
                    to_write.append(line)
                    line = ""
                else:
                    line = line + " " + word
            f.write("\n".join(to_write))
            f.write('\n""".split())')