mirror of https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00

* Pass OOV probability around

This commit is contained in:
parent 5b6bf4d4a6
commit fd525f0675
@@ -110,8 +110,8 @@ def _read_freqs(loc):
         smooth_count = counts.smoother(int(freq))
         log_smooth_count = math.log(smooth_count)
         probs[word] = math.log(smooth_count) - log_total
-    probs['-OOV-'] = math.log(counts.smoother(0)) - log_total
-    return probs
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
 
 
 def _read_senses(loc):
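For context, a minimal sketch of the shape of this first change (the names read_freqs_sketch, freqs, smoother, and total are illustrative, not from the repository): instead of storing the unseen-word estimate under a sentinel '-OOV-' key, the frequency reader now returns it alongside the probability table.

import math

def read_freqs_sketch(freqs, smoother, total):
    # freqs: mapping of word -> raw count; smoother: callable mapping a raw
    # count to a smoothed count (e.g. Simple Good-Turing); total: smoothed total.
    log_total = math.log(total)
    probs = {}
    for word, freq in freqs.items():
        smooth_count = smoother(int(freq))
        probs[word] = math.log(smooth_count) - log_total
    # The probability mass the smoother reserves for unseen events becomes a
    # separate return value rather than a '-OOV-' entry in the table.
    oov_prob = math.log(smoother(0)) - log_total
    return probs, oov_prob

With a Good-Turing-style smoother, smoother(0) is the smoothed count assigned to an unseen word, which appears to be why its log-ratio against the total serves as the OOV estimate here.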
@@ -144,21 +144,20 @@ def setup_vocab(src_dir, dst_dir):
         print("Warning: Word vectors file not found")
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
-    probs = _read_probs(src_dir / 'words.sgt.prob')
+    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
     if not probs:
-        probs = _read_freqs(src_dir / 'freqs.txt')
+        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
     if not probs:
-        min_prob = 0.0
+        oov_prob = 0.0
     else:
-        min_prob = min(probs.values())
+        oov_prob = min(probs.values())
     for word in clusters:
         if word not in probs:
-            probs[word] = min_prob
+            probs[word] = oov_prob
 
     lexicon = []
     for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
         entry = get_lex_props(word)
-        if word in clusters:
         entry['prob'] = float(prob)
         cluster = clusters.get(word, '0')
         # Decode as a little-endian string, so that we can do & 15 to get
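The fallback in setup_vocab can be read as a small standalone step. The sketch below (function name invented for illustration) mirrors that logic: if no probabilities were read at all, fall back to 0.0; otherwise treat the rarest observed word's log-probability as the OOV estimate, and back-fill it for cluster words that have no probability of their own.

def resolve_oov_prob_sketch(probs, clusters):
    # No probability file at all: fall back to 0.0, matching the diff above.
    if not probs:
        oov_prob = 0.0
    else:
        # Otherwise approximate OOV with the smallest observed log-probability.
        oov_prob = min(probs.values())
    for word in clusters:
        if word not in probs:
            probs[word] = oov_prob
    return probs, oov_prob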
@@ -167,6 +166,8 @@ def setup_vocab(src_dir, dst_dir):
         vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))
+    with (dst_dir / 'oov_prob').open('w') as file_:
+        file_.write('%f' % oov_prob)
 
 
 def main(lang_data_dir, corpora_dir, model_dir):
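The commit writes the estimate to an oov_prob file next to lexemes.bin and strings.txt; a loader-side counterpart is not part of this diff. The sketch below is a hypothetical helper, not from the repository, showing how such a file could be read back and handed to the Vocab constructor, whose new oov_prob=-30 default appears further down.

from pathlib import Path

def load_oov_prob_sketch(model_dir, default=-30.0):
    # Hypothetical counterpart to the write above: read <model_dir>/oov_prob
    # if present, otherwise fall back to the default used by Vocab.__init__.
    path = Path(model_dir) / 'oov_prob'
    if path.exists():
        with path.open() as file_:
            return float(file_.read().strip())
    return default

The remaining hunks below extend the Cython Vocab declaration and its implementation with the new oov_prob field.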
@@ -31,6 +31,7 @@ cdef class Vocab:
     cdef readonly int length
     cdef public object _serializer
     cdef public object data_dir
+    cdef public float oov_prob
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
@@ -37,7 +37,7 @@ cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
     def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
-                 pos_tags=None):
+                 pos_tags=None, oov_prob=-30):
         self.mem = Pool()
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
@@ -61,6 +61,7 @@ cdef class Vocab:
 
         self._serializer = None
         self.data_dir = data_dir
+        self.oov_prob = oov_prob
 
     property serializer:
         def __get__(self):
@@ -90,7 +91,7 @@ cdef class Vocab:
         if len(string) < 3:
             mem = self.mem
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        props = self.lexeme_props_getter(string)
+        props = self.lexeme_props_getter(string, self.oov_prob)
         set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
         if is_oov:
             lex.id = 0
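Taken together, the Vocab changes let the OOV estimate travel from the constructor into lexeme creation. A hedged usage sketch follows; the get_lex_props_sketch stub and its field names other than 'prob' are invented for illustration, while the oov_prob keyword and the two-argument getter call come from the diff above.

def get_lex_props_sketch(string, oov_prob=-30.0):
    # Stand-in for the real props getter: an unseen word now receives the
    # OOV log-probability instead of a hard-coded default.
    return {'orth': string, 'prob': oov_prob}

# With the changes above, constructing the in-repo Vocab would look like:
#   vocab = Vocab(data_dir=None, get_lex_props=get_lex_props_sketch, oov_prob=-20.0)
# and Vocab.get() then calls self.lexeme_props_getter(string, self.oov_prob)
# for strings it has not seen before.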