# word2vec.rb
require 'optparse'
#require 'byebug'
#require 'stackprof'
#StackProf.run(mode: :cpu, out: 'example/stackprof-cpu-myapp.dump') do
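# Constants carried over from the original word2vec C implementation; the sigmoid
# table parameters (MAX_EXP, EXP_TABLE_SIZE) are presumably consumed by calc.rb.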
MAX_EXP = 6
EXP_TABLE_SIZE = 1000
MAX_SENTENCE_LENGTH = 1000
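# VocabWord mirrors the vocab_word struct of the C implementation: cn is the word
# count; point, code and codelen belong to hierarchical softmax and are carried
# along but not used in this script.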
VocabWord = Struct.new(:cn, :point, :word, :code, :codelen)
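# Util is the same 64-bit linear congruential generator the original C code uses
# (next_random = next_random * 25214903917 + 11, truncated to 64 bits).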
class Util
def initialize(seed=1)
@next_random = seed
end
def next_random
@next_random = (@next_random * 25214903917 + 11) & 0xffffffffffffffff
return @next_random
end
end
params = ARGV.getopts('h:','binary:0','sample:1e-3', 'size:5', 'iter:5', 'window:5', 'min_count:5', 'negative:5', 'debug:0', 'train:', 'output:')
abort "both --train and --output must be specified" if params['train'].nil? || params['output'].nil?
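# Example invocation (the corpus path and option values are only illustrative);
# the training file is expected to be plain text with whitespace-separated tokens:
#   ruby word2vec.rb --train corpus.txt --output vectors.txt --size 100 --window 5 \
#     --negative 5 --min_count 5 --iter 5 --sample 1e-3 --binary 0 --debug 2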
neu1 = []
vocab = []
layer1_size = params['size'].to_i
train_words = 0
iter = params['iter'].to_i
debug_mode = params['debug'].to_i
window = params['window'].to_i
min_count = params['min_count'].to_i
syn0 = []
syn1 = []
syn1neg = []
negative = params['negative'].to_i
original_text = []
binary = params['binary'].to_i
cbow = 1
# 0 is truthy in Ruby, so test the flag explicitly (the C code relies on `if (cbow)`).
if cbow == 1 then
alpha = 0.05
else
alpha = 0.025
end
starting_alpha = alpha
sample = params['sample'].to_f
table_size = 1e8
table = []
__cum_table = []
__vocab_index_hash = {}
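# First pass over the corpus: record the raw token stream and count how often
# each word occurs, keeping word -> vocab-index lookups in __vocab_index_hash.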
File.open(params['train']){|f|
while line = f.gets
line.split(" ").each do |word|
original_text.push word
if __vocab_index_hash.key?(word) then
vocab[__vocab_index_hash[word]].cn += 1
else
vocab.push VocabWord.new(1, 0, word, 0, 0)
__vocab_index_hash[word] = vocab.size-1
end
printf("\r%dK", original_text.size / 1000) if debug_mode > 1 # progress in thousands of tokens
end
end
}
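# Sort by descending frequency, drop words rarer than --min_count, and put the
# sentence delimiter '</s>' at index 0, as the original C implementation does.
# The word -> index hash is then rebuilt to reflect the new ordering.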
vocab.sort! {|a,b| b.cn <=> a.cn}
vocab.select! {|v| v.cn >= min_count}
train_words = vocab.inject(0) {|sum, v| sum + v.cn}
vocab.unshift VocabWord.new(1, 0, '</s>', 0, 0)
__vocab_index_hash = Hash[vocab.map.with_index {|v, i| [v.word,i]}.to_a]
if negative>0 then
syn1neg = Array.new(vocab.size*layer1_size, 0.0)
# InitUnigramTable
# A naive port of the original table is commented out below; like gensim
# (https://github.com/piskvorky/gensim/blob/develop/gensim/models/word2vec.py#L282),
# we use a cumulative-distribution table instead.
power = 0.75
train_words_pow = 0
for a in 0..vocab.size-1 do
train_words_pow += vocab[a].cn**power
end
table_size = 1e8
#i = 0
#d1 = vocab[i].cn**power / train_words_pow
#for a in 0..table_size-1 do
# table[a] = i
# if a / table_size.to_f > d1 then
# i += 1
# d1 += vocab[i].cn ** power / train_words_pow
# end
# i = vocab.size - 1 if i >= vocab.size
#end
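# Cumulative distribution over cn**0.75, scaled to table_size: drawing a uniform
# integer in [0, table_size) and locating it in __cum_table samples a negative
# word with probability proportional to its count raised to the 0.75 power.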
cumulative = 0.0
for a in 0..vocab.size-1 do
cumulative += vocab[a].cn ** power / train_words_pow
__cum_table[a] = (cumulative*table_size).round
end
end
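# Initialize the input embedding matrix syn0 with small pseudo-random values in
# roughly [-0.5/layer1_size, 0.5/layer1_size), mirroring the C initialization.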
util = Util.new(1)
syn0 = (0...vocab.size*layer1_size).map { ((util.next_random & 0xFFFF).to_f / 65536 - 0.5) / layer1_size }
puts "vocab.size=#{vocab.size}, layer1_size=#{layer1_size}, syn0.size=#{syn0.size}"
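# Reduce each vocab entry to a plain [cn, word] pair before handing it to calc_vec.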
vocab = vocab.map {|e| [e.cn, e.word] }
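# If the virtual_module gem is loaded, wrap the large numeric arrays and evaluate
# calc_vec inside the VirtualModule (presumably so the heavy training loop runs on
# a faster backend); otherwise require calc.rb and call it as ordinary Ruby.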
if defined?(VirtualModule)
class FloatArray < Array
end
syn0 = FloatArray.new(syn0)
__cum_table = FloatArray.new(__cum_table)
syn1neg = FloatArray.new(syn1neg)
vm = VirtualModule.new(methods:File.read(File.dirname(__FILE__)+"/calc.rb"))
vm.virtual_module_eval("calc_vec(iter, original_text, sample, train_words, debug_mode, __vocab_index_hash, vocab, syn0, syn1neg, negative, alpha, __cum_table, table_size, layer1_size, window)")
else
require "#{File.dirname(__FILE__)}/calc.rb"
calc_vec(iter, original_text, sample, train_words, debug_mode, __vocab_index_hash, vocab, syn0, syn1neg, negative, alpha, __cum_table, table_size, layer1_size, window)
end
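# Emit output in the standard word2vec format: a "vocab_size vector_size" header,
# then one line per word with its vector, either as text floats (--binary 0) or as
# packed native single-precision floats.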
out = sprintf("%d %d\n", vocab.size, layer1_size)
# Build binary output in an ASCII-8BIT buffer so packed floats (and any non-ASCII
# words) can be concatenated without Encoding::CompatibilityError.
out.force_encoding(Encoding::ASCII_8BIT) if binary != 0
for a in 0..vocab.size-1 do
word = vocab[a][1]
word = word.b if binary != 0
out += word + " "
for b in 0..layer1_size-1 do
if binary==0 then
out += sprintf("%f ", syn0[a*layer1_size + b])
else
out += [syn0[a*layer1_size + b]].pack("f*")
end
end
out += "\n"
end
if binary==0 then
File.write(params['output'], out)
else
File.binwrite(params['output'], out)
end
#end # closes the commented-out StackProf.run block above