diff --git a/ICE/ICE.cpp b/ICE/ICE.cpp
index 30f28d9..38ba524 100644
--- a/ICE/ICE.cpp
+++ b/ICE/ICE.cpp
@@ -18,7 +18,36 @@ void ICE::LoadDict(unordered_map< string, vector<string> >& graph)
     vvnet.LoadDict(graph);
 }
 
-void ICE::SaveWeights(string model_name){
+void ICE::LoadWeights(string filename){
+    vvnet.LoadWeights(filename, w_context);
+}
+
+void ICE::SaveVertexWeights(string model_name){
+
+    cout << "Save Model:" << endl;
+    ofstream model(model_name);
+    if (model)
+    {
+        //model << vvnet.MAX_vid << " " << dim << endl;
+        for (auto k: vvnet.keys)
+        {
+            if (vvnet.vertex[vvnet.kmap[k]].branch > 0)
+            {
+                model << k;
+                for (int d=0; d<dim; ++d)
+                    model << " " << w_vertex[vvnet.kmap[k]][d];
+                model << endl;
+            }
+        }
+        cout << "\tSave to <" << model_name << ">" << endl;
+    }
+    else
+    {
+        cout << "\tfail to open file" << endl;
+    }
+}
+
+void ICE::SaveContextWeights(string model_name){
 
     cout << "Save Model:" << endl;
     ofstream model(model_name);
@@ -27,6 +56,37 @@ void ICE::SaveWeights(string model_name){
         //model << vvnet.MAX_vid << " " << dim << endl;
         for (auto k: vvnet.keys)
         {
+            if (vvnet.vertex[vvnet.kmap[k]].branch > 0)
+            {
+                model << k;
+                for (int d=0; d<dim; ++d)
+                    model << " " << w_context[vvnet.kmap[k]][d];
+                model << endl;
+            }
+        }
+        cout << "\tSave to <" << model_name << ">" << endl;
+    }
+    else
+    {
+        cout << "\tfail to open file" << endl;
+    }
+}
+
+void ICE::SaveEntityWeights(string model_name){
+
+    cout << "Save Model:" << endl;
+    ofstream model(model_name);
+    int i = 0;
+    if (model)
+    {
+        //model << vvnet.MAX_vid << " " << dim << endl;
+        for (auto k: vvnet.keys)
+        {
+            // the first vocab_count keys are text vertices; skip them so that
+            // only entity vertices are written
+            if (i < vocab_count){
+                i++;
+                continue;
+            }
             if (vvnet.vertex[vvnet.kmap[k]].branch > 0)
             {
                 model << k;
@@ -43,6 +103,10 @@ void ICE::SaveWeights(string model_name){
     }
 }
 
+int ICE::getVocabCount(){
+    vocab_count = vvnet.keys.size();
+    return vocab_count;
+}
+
 void ICE::Init(int dimension) {
 
     cout << "Model Setting:" << endl;
@@ -70,7 +134,7 @@ void ICE::Init(int dimension) {
 
 }
 
-void ICE::Train(int sample_times, int negative_samples, double alpha, double alpha_min, int workers){
+void ICE::TrainStage1(int sample_times, int negative_samples, double alpha, double alpha_min, int workers){
 
     omp_set_num_threads(workers);
@@ -103,6 +167,7 @@ void ICE::Train(int sample_times, int negative_samples, double alpha, double alpha_min, int workers){
         {
             v1 = vvnet.SourceSample();
             v2 = vvnet.TargetSample(v1);
+
             vvnet.UpdatePair(w_vertex, w_context, v1, v2, dim, negative_samples, _alpha);
 
             count++;
@@ -121,3 +186,57 @@ void ICE::Train(int sample_times, int negative_samples, double alpha, double alpha_min, int workers){
 }
 
+void ICE::TrainStage2(int sample_times, int negative_samples, double alpha, double alpha_min, int workers, int vocab_count){
+
+    omp_set_num_threads(workers);
+
+    cout << "Model:" << endl;
+    cout << "\t[ICE]" << endl;
+
+    cout << "Learning Parameters:" << endl;
+    cout << "\tsample_times:\t\t" << sample_times << " (*Million)" << endl;
+    cout << "\tnegative_samples:\t" << negative_samples << endl;
+    cout << "\talpha:\t\t\t" << alpha << endl;
+    cout << "\tworkers:\t\t" << workers << endl;
+
+    cout << "Start Training:" << endl;
+
+    unsigned long long total_sample_times = (unsigned long long)sample_times*1000000;
+    double alpha_last = alpha, alpha_reduce;
+    double _alpha = alpha;
+    alpha_reduce = (alpha-alpha_min)/(total_sample_times/MONITOR);
+
+    unsigned long long current_sample = 0;
+    unsigned long long jobs = total_sample_times/workers;
+
+    #pragma omp parallel for
+    for (int worker=0; worker<workers; worker++)
+    {
+        long v1, v2;
+        unsigned long long count = 0;
+
+        while (count < jobs)
+        {
+            v1 = vvnet.SourceSample();
+            v2 = vvnet.TargetSample(v1);
+            // entity vertices were inserted after the first vocab_count text
+            // vertices, so only entities are updated here; text vectors stay fixed
+            if (v1 >= vocab_count){
+                vvnet.UpdateVertex(w_vertex, w_context, v1, v2, dim, negative_samples, _alpha);
+            }
+
+            count++;
+            if (count % MONITOR == 0)
+            {
+                _alpha -= alpha_reduce;
+                current_sample += MONITOR;
+                if (_alpha < alpha_min) _alpha = alpha_min;
+                alpha_last = _alpha;
+                printf("\tAlpha: %.6f\tProgress: %.3f %%%c", _alpha, (double)(current_sample)/total_sample_times * 100, 13);
+                fflush(stdout);
+            }
+        }
+    }
+    printf("\tAlpha: %.6f\tProgress: 100.00 %%\n", alpha_last);
+
+}
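Both training stages anneal the learning rate the same way: alpha falls linearly from its initial value to alpha_min, stepping once every MONITOR samples and clamping at the floor. A minimal Python sketch of that schedule, using illustrative stand-ins for MONITOR and the sample budget:

    # Linear alpha decay as in TrainStage1/TrainStage2 (constants illustrative).
    MONITOR = 10000

    def alpha_schedule(alpha, alpha_min, sample_times):
        total = sample_times * 1000000               # sample_times is in millions
        reduce_step = (alpha - alpha_min) / (total / MONITOR)
        cur = alpha
        for _ in range(total // MONITOR):
            cur = max(cur - reduce_step, alpha_min)  # clamp at the floor
            yield cur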
diff --git a/ICE/ICE.h b/ICE/ICE.h
index 2191d40..0226c55 100644
--- a/ICE/ICE.h
+++ b/ICE/ICE.h
@@ -18,18 +18,24 @@ class ICE {
 
     // model parameters
     int dim;
+    int vocab_count;
     vector< vector<double> > w_vertex;
     vector< vector<double> > w_context;
 
     // data function
     void LoadEdgeList(string);
+    void LoadWeights(string);
     void LoadItemConceptList(string);
     void LoadDict(unordered_map< string, vector<string> >&);
-    void SaveWeights(string);
+    void SaveVertexWeights(string);
+    void SaveContextWeights(string);
+    void SaveEntityWeights(string);
+    int getVocabCount();
 
     // model function
     void Init(int);
-    void Train(int, int, double, double, int);
+    void TrainStage1(int, int, double, double, int);
+    void TrainStage2(int, int, double, double, int, int);
 };
diff --git a/ICE/construct_graph.py b/ICE/construct_graph.py
index 78b8f29..b09cf13 100644
--- a/ICE/construct_graph.py
+++ b/ICE/construct_graph.py
@@ -1,135 +1,218 @@
-from __future__ import print_function, division, unicode_literals
+################################################################################
+#      ______                     ______                            __         #
+#     /  _/ /____  ____ ___      / ____/___  ____  ________  ____  / /_        #
+#     / // __/ _ \/ __ `__ \    / /   / __ \/ __ \/ ___/ _ \/ __ \/ __/        #
+#   _/ // /_/  __/ / / / / /   / /___/ /_/ / / / / /__/  __/ /_/ / /_          #
+#  /___/\__/\___/_/ /_/ /_/    \____/\____/_/ /_/\___/\___/ .___/\__/          #
+#                                                        /_/                   #
+#      ______          __              __    ___                               #
+#     / ____/___ ___  / /_  ___  ____/ /___/ (_)___  ____ _                    #
+#    / __/ / __ `__ \/ __ \/ _ \/ __  / __  / / __ \/ __ `/                    #
+#   / /___/ / / / / / /_/ /  __/ /_/ / /_/ / / / / / /_/ /                     #
+#  /_____/_/ /_/ /_/_.___/\___/\__,_/\__,_/_/_/ /_/\__, /                      #
+#                                                  /____/   credit: patorjk    #
+################################################################################
+
+# Proj: Item Concept Embedding (ICE)
+# File: construct_graph.py
+# Cont:
+# Func:
+#   1) get_user_input          2) gen_et_network
+#   3) gen_tt_network          4) dict2sparse_mat
+#   5) save_ice_et_network     6) save_ice_tt_network
+
 import argparse
-import csv
 from collections import defaultdict
 import numpy as np
 import sys
-from copy import deepcopy
 from scipy.sparse import csr_matrix
-
-
-if __name__ == '__main__':
-    PARSER = argparse.ArgumentParser(description='Transform text data to edge list file.')
-
-    PARSER.add_argument('-et', '--entitytext', default=None, help='Entity-Text edgelist File Name')
-    PARSER.add_argument('-ice', '--icefile', default=None, help='ICE graph File Name')
-    PARSER.add_argument('-tt', '--textedges', default=None, help='Text-Text edgelist File Name')
-
+from tqdm import tqdm
+
+def get_user_input():
+    """ Get inputs from user.
+    Return:
+        return1 [string] path to load the entity-text relation edge list.
+        return2 [string] path to load the text-text relation edge list.
+        return3 [list] of 2-tuples of path and file open mode for saving the
+                expanded entity-text subnetwork within an ICE network.
+        return4 [list] of 2-tuples of path and file open mode for saving the
+                text-text subnetwork within an ICE network.
+        return5 [int] indicator of whether to use binary or real weights.
+    """
+    PARSER = argparse.ArgumentParser(description='Construct ICE network from ET and TT relations.')
+    PARSER.add_argument('-et', help='Path to load ET relation.')
+    PARSER.add_argument('-tt', help='Path to load TT relation.')
+    PARSER.add_argument('-ice_full', help='(Optional) Path to save full ICE network.')
+    PARSER.add_argument('-ice_et', help='(Optional) Path to save ET part of ICE network.')
+    PARSER.add_argument('-ice_tt', help='(Optional) Path to save TT part of ICE network.')
+    PARSER.add_argument('-w', type=int, default=0, choices=[0, 1], help='0: unweighted (default) / 1: weighted.')
     CONFIG = PARSER.parse_args()
 
-    if CONFIG.entitytext == None:
-        print("Please give Entity-Text edgelist!\nusage: graph.py [-h] [-et ENTITYTEXT] [-o OUTPUT] [-tt TEXTEDGES]")
+    if CONFIG.et is None:
+        print('Please specify a path to load ET relation edge list.')
         sys.exit()
-    elif CONFIG.icefile == None:
-        print("Please give ICE graph name\nusage: graph.py [-h] [-et ENTITYTEXT] [-o OUTPUT] [-tt TEXTEDGES]")
+    elif CONFIG.tt is None:
+        print('Please specify a path to load TT relation edge list.')
         sys.exit()
-    elif CONFIG.textedges == None:
-        print("Please give Text-Text edgelist\nusage: graph.py [-h] [-et ENTITYTEXT] [-o OUTPUT] [-tt TEXTEDGES]")
+    elif CONFIG.ice_full is None and CONFIG.ice_et is None and CONFIG.ice_tt is None:
+        print('Please specify at least one path to save the full or part of the ICE network.')
         sys.exit()
 
-    input_path = CONFIG.entitytext
-    output_path = CONFIG.icefile
-    word_edge_path = CONFIG.textedges
-
-    item_count = 0
-    vocabulary_set = set()
-    word2id = {}
-    id2word = {}
-    entity_text_dict = defaultdict(set)
-    text_text_dict = defaultdict(set)
-    print(input_path, output_path, word_edge_path)
-
-    print('Construct item-text structure...')
-    with open(input_path) as f:
+    et_save_list = []
+    tt_save_list = []
+
+    if CONFIG.ice_full is not None:
+        et_save_list.append((CONFIG.ice_full, 'w'))
+        tt_save_list.append((CONFIG.ice_full, 'a'))
+    if CONFIG.ice_et is not None:
+        et_save_list.append((CONFIG.ice_et, 'w'))
+    if CONFIG.ice_tt is not None:
+        tt_save_list.append((CONFIG.ice_tt, 'w'))
+
+    return CONFIG.et, CONFIG.tt, et_save_list, tt_save_list, CONFIG.w
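As a worked example of the save-list logic above, a hypothetical invocation `python construct_graph.py -et et.edge -tt tt.edge -ice_full ice.edge -ice_tt tt_part.edge -w 1` would yield the lists below; the full-network file takes the ET part in write mode and the TT part in append mode, so both subnetworks end up in one file:

    # Resulting save lists for the hypothetical invocation above.
    et_save_list = [('ice.edge', 'w')]
    tt_save_list = [('ice.edge', 'a'), ('tt_part.edge', 'w')]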
+ """ + PARSER = argparse.ArgumentParser(description='Construct ICE network from ET and TT relations.') + PARSER.add_argument('-et', help='Path to load ET relation.') + PARSER.add_argument('-tt', help='Path to load TT relation.') + PARSER.add_argument('-ice_full', help='(Optoinal) Path to save full ICE network.') + PARSER.add_argument('-ice_et', help='(Optional) Path to save ET part of ICE network.') + PARSER.add_argument('-ice_tt', help='(Optional) Path to save TT part of ICE network.') + PARSER.add_argument('-w', type=int, default=0, choices=[0,1], help='(Default) 0:unweighted / 1:weighted.') CONFIG = PARSER.parse_args() - if CONFIG.entitytext == None: - print("Please give Entity-Text edgelist!\nusage: graph.py [-h] [-et ENTITYTEXT] [-o OUTPUT] [-tt TEXTEDGES]") + if CONFIG.et == None: + print('Please specify a path to load ET relation edge list.') sys.exit() - elif CONFIG.icefile == None: - print("Please give ICE graph name\nusage: graph.py [-h] [-et ENTITYTEXT] [-o OUTPUT] [-tt TEXTEDGES]") + elif CONFIG.tt == None: + print('Please specify a path to load TT relation edge list.') sys.exit() - elif CONFIG.textedges == None: - print("Please give Text-Text edgelist\nusage: graph.py [-h] [-et ENTITYTEXT] [-o OUTPUT] [-tt TEXTEDGES]") + elif CONFIG.ice_full == CONFIG.ice_et == CONFIG.ice_tt == None: + print('Please specify at least one path to save the full or part of the ICE network.') sys.exit() - input_path = CONFIG.entitytext - output_path = CONFIG.icefile - word_edge_path = CONFIG.textedges - - item_count = 0 - vocabulary_set = set() - word2id = {} - id2word = {} - entity_text_dict = defaultdict(set) - text_text_dict = defaultdict(set) - print(input_path, output_path, word_edge_path) - - print('Construct item-text structure...') - with open(input_path) as f: + et_save_list = [] + tt_save_list = [] + + if CONFIG.ice_full != None: + et_save_list.append((CONFIG.ice_full, 'w')) + tt_save_list.append((CONFIG.ice_full, 'a')) + if CONFIG.ice_et != None: + et_save_list.append((CONFIG.ice_et, 'w')) + if CONFIG.ice_tt != None: + tt_save_list.append((CONFIG.ice_tt, 'w')) + + return CONFIG.et, CONFIG.tt, et_save_list, tt_save_list, CONFIG.w + + +def gen_et_network(et_path): + """ Construct an entity-text network from entity-text relations. + Param: + param1 [string] path to an edge list file where entity-text relations + are defined. + Return: + return1 [dict] where key=entity & val=list of 2-tuples of rep word and + respective weight. + Note: + 1) Assume word set in ET is a subset of word set in TT as a word only in + TT will NOT benefit from ICE algorithm and is assumed to be removed. + """ + et_dict = defaultdict(set) + + with open(et_path) as f: for line in f: - line = line.split() - fromNode = line[0] - toNode = line[1] - vocabulary_set.add(toNode) - if fromNode not in entity_text_dict: - item_count += 1 - entity_text_dict[fromNode].add(toNode) - - - - print('Construct text-text structure...') - with open(word_edge_path) as f: + entry = line.split() + et_dict[entry[0]].add((entry[1], float(entry[2]))) # directed + + return et_dict + + +def gen_tt_network(tt_path): + """ Construct a text-text network from text-text relations. + Param: + param1 [string] path to an edge list file where text-text relations are + defined. + Return: + return [dict] where key=rep word & val=list of 2-tuples of exp word and + respective weight. 
+ """ + tt_dict = defaultdict(set) + word_set = set() + + with open(tt_path) as f: for line in f: - line = line.split() - fromNode = line[0] - toNode = line[1] - text_text_dict[fromNode].add(toNode) - text_text_dict[toNode].add(fromNode) - vocabulary_set.update([fromNode, toNode]) - vocabulary_set.update([toNode, fromNode]) - for word in vocabulary_set: - text_text_dict[word].add(word) - - vocabulary_list = list(vocabulary_set) - vocabulary_count = len(vocabulary_list) - for i in range(len(vocabulary_list)): - word2id[vocabulary_list[i]] = i - id2word[i] = vocabulary_list[i] - - i = 0 - row = [] - column = [] - data = [] - print("Build entity-text Matrix...") - entity_list = [] - for entity, words in entity_text_dict.items(): - entity_list.append(entity) - words_index = [word2id[w] for w in words] - for w in words_index: - row.append(i) - column.append(w) - data.append(1) - i += 1 - row = np.array(row) - column = np.array(column) - data = np.array(data) - M_et = csr_matrix((data, (row, column)), shape=(item_count, vocabulary_count), dtype=np.int8) - - i = 0 - row = [] - column = [] - data = [] - print("Build text-text Matrix...") - for word, words in text_text_dict.items(): - words_index = [word2id[w] for w in words] - for w in words_index: - row.append(word2id[word]) - column.append(w) - data.append(1) - i += 1 - row = np.array(row) - column = np.array(column) - data = np.array(data) - M_tt = csr_matrix((data, (row, column)), shape=(vocabulary_count, vocabulary_count), dtype=np.int8) - - print("Build ICE Graph...") - - A = M_et.dot(M_tt) - - edge_list = [] - result = A.nonzero() - size = result[0].size - row = result[0] - col = result[1] - for i in range(size): - edge_list.append((entity_list[row[i]], vocabulary_list[col[i]])) - # print("{}/{}".format(i, size), end='\r') - - for i in range(len(vocabulary_list)): - word = vocabulary_list[i] - word2word = [x for x in text_text_dict[word]] - if len(word2word) == 0: - continue - for w in word2word: - edge_list.append((word, w)) - - - print("Write edge list...") - with open(output_path, 'w') as f: - for edge in edge_list: - f.write(edge[0] + ' ' + edge[1] + ' 1\n') + entry = line.split() + tt_dict[entry[0]].add((entry[1], float(entry[2]))) # undirected + tt_dict[entry[1]].add((entry[0], float(entry[2]))) + tt_dict[entry[0]].add((entry[0], float(1))) # cos similarity to self + tt_dict[entry[1]].add((entry[1], float(1))) + word_set.update([entry[0], entry[1]]) + + return tt_dict, list(word_set) + + +def dict2sparse_mat(edge_dict, node2row, node2col): + """ Convert dictionary-based to sparse-matrix-based network. + Param: + param1 [dict] where key=from node & val=list of 2-tuples of to node and + respective weight. + param2 [dict] where key=node & val=row number. + param3 [dict] where key=node & val=col number. + Return: + return1 [csr_matrix] sparse matrix network. + """ + row_list = [] + col_list = [] + weight_list = [] + for from_node, tup_list in tqdm(edge_dict.items()): + for to_node, weight in tup_list: + row_list.append(node2row[from_node]) + col_list.append(node2col[to_node]) + weight_list.append(weight) + + return csr_matrix((weight_list, (row_list, col_list)), shape=(len(node2row), len(node2col)), dtype=np.float64) + + +def save_ice_et_network(exp_et_matrix, row2entity, col2word, et_save_list, weighted): + """ Save the expanded entity-text subnetwork within an ICE network. + Param: + param1 [csr_matrix] sparse matrix of the expanded entity-text network. + param2 [dict] where key=row number & val=entity. 
+
+
+def save_ice_et_network(exp_et_matrix, row2entity, col2word, et_save_list, weighted):
+    """ Save the expanded entity-text subnetwork within an ICE network.
+    Param:
+        param1 [csr_matrix] sparse matrix of the expanded entity-text network.
+        param2 [list/dict] mapping row number to entity.
+        param3 [list/dict] mapping col number to rep word.
+        param4 [list] of 2-tuples of path and file open mode.
+        param5 [int] indicator of whether to use binary or real weights.
+    """
+    et_f_list = [open(et_path, mode) for et_path, mode in et_save_list]
+
+    if weighted == 1:
+        for row, col in tqdm(list(zip(*exp_et_matrix.nonzero()))):
+            entry = row2entity[row] + ' ' + col2word[col] + ' ' + str(exp_et_matrix[row, col]) + '\n'
+
+            for et_f in et_f_list:
+                et_f.write(entry)
+    else:
+        for row, col in tqdm(list(zip(*exp_et_matrix.nonzero()))):
+            entry = row2entity[row] + ' ' + col2word[col] + ' 1.0\n'
+
+            for et_f in et_f_list:
+                et_f.write(entry)
+
+    for et_f in et_f_list:
+        et_f.close()
+
+
+def save_ice_tt_network(tt_dict, tt_save_list, weighted):
+    """ Save the text-text subnetwork within an ICE network.
+    Param:
+        param1 [dict] where key=rep word & val=set of 2-tuples of exp word and
+               respective weight.
+        param2 [list] of 2-tuples of path and file open mode.
+        param3 [int] indicator of whether to use binary or real weights.
+    """
+    tt_f_list = [open(tt_path, mode) for tt_path, mode in tt_save_list]
+
+    for rep_word, tup_list in tqdm(tt_dict.items()):
+        for exp_word, weight in tup_list:
+            # weight**1 keeps the real weight; weight**0 == 1.0 yields binary weights
+            entry = rep_word + ' ' + exp_word + ' ' + str(weight**weighted) + '\n'
+
+            for tt_f in tt_f_list:
+                tt_f.write(entry)
+
+    for tt_f in tt_f_list:
+        tt_f.close()
+
+
+def main():
+    # Step 0: Get inputs from user.
+    et_path, tt_path, et_save_list, tt_save_list, w = get_user_input()
+
+    print('\nStart constructing ICE network!')
+    print('Step 1-1: Construct ET network from ET relations...')
+    et_dict = gen_et_network(et_path)
+
+    print('Step 1-2: Construct TT network from TT relations...')
+    tt_dict, word_list = gen_tt_network(tt_path)
+
+    entity_list = list(et_dict.keys())
+    entity2index = {entity: index for index, entity in enumerate(entity_list)}
+    word2index = {word: index for index, word in enumerate(word_list)}
+
+    print('Step 2-1: Convert the entity-text network into a sparse matrix...')
+    et_matrix = dict2sparse_mat(et_dict, entity2index, word2index)
+
+    print('Step 2-2: Convert the text-text network into a sparse matrix...')
+    tt_matrix = dict2sparse_mat(tt_dict, word2index, word2index)
+
+    print('Step 3: Perform concept expansion...')
+    exp_et_matrix = et_matrix.dot(tt_matrix)
+
+    print('Step 4-1: Save ET part of the ICE network...')
+    save_ice_et_network(exp_et_matrix, entity_list, word_list, et_save_list, w)
+
+    print('Step 4-2: Save TT part of the ICE network...')
+    save_ice_tt_network(tt_dict, tt_save_list, w)
+
+    print('Finished constructing ICE network!\n')
+
+
+if __name__ == '__main__':
+    main()
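Step 3 is the core of ICE: a single sparse product propagates entity-to-word weights through word-to-word similarities, so an entity linked only to a representative word also picks up that word's expansion words. A worked example under the toy values used above:

    from scipy.sparse import csr_matrix

    et_matrix = csr_matrix([[2.0, 0.0]])          # item1 -> apple only
    tt_matrix = csr_matrix([[1.0, 0.8],
                            [0.8, 1.0]])          # apple/fruit with self-loops
    exp_et_matrix = et_matrix.dot(tt_matrix)
    # exp_et_matrix.toarray() -> [[2.0, 1.6]]     # item1 now also scores 'fruit'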
diff --git a/ICE/main.cpp b/ICE/main.cpp
index 5bbaf23..ce67835 100644
--- a/ICE/main.cpp
+++ b/ICE/main.cpp
@@ -21,10 +21,18 @@ int main(int argc, char **argv){
         printf("[ICE-CLI]\n");
         printf("\tcommand line interface for ICE\n\n");
         printf("Options:\n");
-        printf("\t-train <string>\n");
-        printf("\t\tTrain the Network data\n");
+        // printf("\t-train <string>\n");
+        // printf("\t\tTrain the Network data\n");
+        printf("\t-text <string>\n");
+        printf("\t\ttext-text network data\n");
+        printf("\t-entity <string>\n");
+        printf("\t\tentity-text network data\n");
+        printf("\t-textrep <string>\n");
+        printf("\t\tsave the word representation data\n");
+        printf("\t-textcontext <string>\n");
+        printf("\t\tsave the word context data\n");
         printf("\t-save <string>\n");
-        printf("\t\tSave the representation data\n");
+        printf("\t\tSave the entity representation data\n");
         printf("\t-save_times <int>\n");
         printf("\t\tsave a model times; default is 1\n");
         printf("\t-dim <int>\n");
@@ -38,16 +46,21 @@ int main(int argc, char **argv){
         printf("\t-alpha <float>\n");
         printf("\t\tInit learning rate; default is 0.025\n");
         printf("\nExample Usage:\n");
-        printf("\t./ice -train network.txt -save rep.txt -dim 64 -sample 10 -neg 5 -alpha 0.025 -thread 4\n\n");
+        printf("\t./ice -text ../data/word.txt -entity ../data/entity.txt -textrep ../data/text.embd -save ../data/result.txt -textcontext ../data/context.txt -dim 4 -sample 10 -neg 5 -alpha 0.025 -thread 4\n\n");
         return 0;
     }
 
-    char network_file[100], rep_file[100];
+    char text_file[100], text_rep_file[100], text_context_file[100], entity_file[100], result_file[100];
     int dimensions=64, negative_samples=5, sample_times=10, save_times=1, threads=1;
     double init_alpha=0.025;
 
-    if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(network_file, argv[i + 1]);
-    if ((i = ArgPos((char *)"-save", argc, argv)) > 0) strcpy(rep_file, argv[i + 1]);
+    // if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(network_file, argv[i + 1]);
+    // if ((i = ArgPos((char *)"-save", argc, argv)) > 0) strcpy(rep_file, argv[i + 1]);
+    if ((i = ArgPos((char *)"-text", argc, argv)) > 0) strcpy(text_file, argv[i + 1]);
+    if ((i = ArgPos((char *)"-entity", argc, argv)) > 0) strcpy(entity_file, argv[i + 1]);
+    if ((i = ArgPos((char *)"-textrep", argc, argv)) > 0) strcpy(text_rep_file, argv[i + 1]);
+    if ((i = ArgPos((char *)"-textcontext", argc, argv)) > 0) strcpy(text_context_file, argv[i + 1]);
+    if ((i = ArgPos((char *)"-save", argc, argv)) > 0) strcpy(result_file, argv[i + 1]);
     if ((i = ArgPos((char *)"-save_times", argc, argv)) > 0) save_times = atoi(argv[i + 1]);
     if ((i = ArgPos((char *)"-dim", argc, argv)) > 0) dimensions = atoi(argv[i + 1]);
     if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample_times = atoi(argv[i + 1]);
@@ -57,8 +70,9 @@ int main(int argc, char **argv){
 
     ICE *ice;
     ice = new ICE();
-    ice->LoadEdgeList(network_file);
+    ice->LoadEdgeList(text_file);
     ice->Init(dimensions);
+    ice->getVocabCount();
 
     int sub_sample_times = sample_times/save_times, current_sample_times=0;
     double alpha_max=init_alpha, alpha_min=init_alpha;
@@ -68,16 +82,46 @@ int main(int argc, char **argv){
         alpha_max = alpha_min;
         alpha_min = init_alpha*((double)(save_times-i-1)/save_times);
         if (alpha_min < init_alpha*0.0001) alpha_min = init_alpha*0.0001;
-        ice->Train(sub_sample_times, negative_samples, alpha_max, alpha_min, threads);
+        ice->TrainStage1(sub_sample_times, negative_samples, alpha_max, alpha_min, threads);
 
         if (i==(save_times-1))
         {
-            ice->SaveWeights(rep_file);
+            ice->SaveVertexWeights(text_rep_file);
+            ice->SaveContextWeights(text_context_file);
         }
         else
         {
-            string sub_rep_file = rep_file + string(".") + to_string(current_sample_times);
-            ice->SaveWeights(sub_rep_file);
+            string sub_text_rep_file = text_rep_file + string(".") + to_string(current_sample_times);
+            ice->SaveVertexWeights(sub_text_rep_file);
+            string sub_text_context_file = text_context_file + string(".") + to_string(current_sample_times);
+            ice->SaveContextWeights(sub_text_context_file);
+
         }
     }
 
+    ice->LoadEdgeList(entity_file);
+    ice->Init(dimensions);
+    ice->LoadWeights(text_context_file);
+
+    sub_sample_times = sample_times/save_times, current_sample_times=0;
+    alpha_max=init_alpha, alpha_min=init_alpha;
+    for (int i=0; i<save_times; i++)
+    {
+        current_sample_times++;
+        alpha_max = alpha_min;
+        alpha_min = init_alpha*((double)(save_times-i-1)/save_times);
+        if (alpha_min < init_alpha*0.0001) alpha_min = init_alpha*0.0001;
+        ice->TrainStage2(sub_sample_times, negative_samples, alpha_max, alpha_min, threads, ice->vocab_count);
+
+        if (i==(save_times-1))
+        {
+            ice->SaveEntityWeights(result_file);
+        }
+        else
+        {
+            string sub_result_file = result_file + string(".") + to_string(current_sample_times);
+            ice->SaveEntityWeights(sub_result_file);
+        }
+    }
 }
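The driver also anneals across save rounds: each round starts from the previous round's alpha_min, which itself steps down linearly with a floor of 0.0001*init_alpha. A sketch of the per-round (alpha_max, alpha_min) pairs, with illustrative defaults:

    # Per-round bounds as computed in main() above (defaults illustrative).
    def round_schedule(init_alpha=0.025, save_times=4):
        alpha_min = init_alpha
        for i in range(save_times):
            alpha_max = alpha_min
            alpha_min = max(init_alpha * (save_times - i - 1) / save_times,
                            init_alpha * 0.0001)
            yield alpha_max, alpha_min
    # list(round_schedule()) -> [(0.025, 0.01875), (0.01875, 0.0125),
    #                            (0.0125, 0.00625), (0.00625, 2.5e-06)]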
diff --git a/ICE/vvNet.cpp b/ICE/vvNet.cpp
index 7f7714b..45648fb 100644
--- a/ICE/vvNet.cpp
+++ b/ICE/vvNet.cpp
@@ -86,6 +86,42 @@ int vvNet::SearchHashTable(string key)
     }
 }
 
+void vvNet::LoadWeights(string filename, vector< vector<double> > &w_context){
+    FILE *fin;
+    char *pch;
+    char c_line[10000]={'\0'};
+    string key;
+    double val = 0;
+    long key_id = 0;
+    int i;
+
+    fin = fopen(filename.c_str(), "rb");
+    if (fin == NULL)
+    {
+        cout << "\tfail to open file" << endl;
+        return;
+    }
+    while (fgets(c_line, sizeof(c_line), fin))
+    {
+        bool is_first = true;
+        i = -1;
+
+        pch = strtok(c_line, " ");
+        key = pch;
+        // cout << key << endl;
+        key_id = SearchHashTable(key);
+        if (key_id == -1)
+            continue;
+        // cout << key << ": " << key_id << endl;
+        while (pch != NULL){
+            if (!is_first){
+                val = strtod(pch, NULL);
+                // cout << val << endl;
+                w_context[key_id][i] = val;
+            }
+            is_first = false;
+            pch = strtok(NULL, " ");
+            i++;
+        }
+    }
+    fclose(fin);
+}
+
 void vvNet::LoadEdgeList(string filename) {
 
     // calculate the total connections
@@ -93,85 +129,67 @@ void vvNet::LoadEdgeList(string filename) {
     char c_line[10000];
     vector< string > filenames;
     vector< int > filelines;
-
-    // load from a folder or from a file
-    if (isDirectory(filename.c_str()))
-    {
-        DIR *dir;
-        struct dirent *ent;
-        dir = opendir(filename.c_str());
-        while ((ent = readdir (dir)) != NULL) {
-            string fname = filename + "/" + ent->d_name;
-            filenames.push_back(fname);
-        }
-        closedir(dir);
-    }
-    else
-    {
-        filenames.push_back(filename.c_str());
-    }
-
+    int lines = 0;
+
     cout << "Preview:" << endl;
-    for (auto fname: filenames)
+    fin = fopen(filename.c_str(), "rb");
+    while (fgets(c_line, sizeof(c_line), fin))
     {
-        fin = fopen(fname.c_str(), "rb");
-        while (fgets(c_line, sizeof(c_line), fin))
+        if (MAX_edge % MONITOR == 0)
         {
-            if (MAX_edge % MONITOR == 0)
-            {
-                printf("\t# of connection:\t%lld%c", MAX_edge, 13);
-            }
-            ++MAX_edge;
+            printf("\t# of connection:\t%lld%c", MAX_edge, 13);
         }
-        fclose(fin);
-        filelines.push_back(MAX_edge);
+        ++lines;
+        ++MAX_edge;
     }
-    cout << "\t# of connection:\t" << MAX_edge << endl;
+    fclose(fin);
+    cout << "\t# of connection:\t" << lines << endl;
 
     // load the connections
     char v1[160], v2[160];
     double w;
     long vid1, vid2;
-    unordered_map< long, unordered_map<long, double> > vv; // vertex -> vertex: weight
+    // unordered_map< long, unordered_map<long, double> > vv; // vertex -> vertex: weight
 
     cout << "Connections Loading:" << endl;
     unsigned long long line = 0;
-    for (int i=0; i<filenames.size(); i++)
-    {
-        fin = fopen(filenames[i].c_str(), "rb");
-        while (fscanf(fin, "%s %s %lf", v1, v2, &w) == 3)
-        {
-            line++;
-            vid1 = SearchHashTable(v1);
-            if (vid1 == -1) vid1 = InsertHashTable(v1);
-        }
-        fclose(fin);
-    }
+    fin = fopen(filename.c_str(), "rb");
+    while (fscanf(fin, "%s %s %lf", v1, v2, &w) == 3)
+    {
+        line++;
+        vid1 = SearchHashTable(v1);
+        if (vid1 == -1)
+        {
+            vid1 = InsertHashTable(v1);
+            // cout << v1 << ": insert ->" << vid1 << endl;
+        }
+        vid2 = SearchHashTable(v2);
+        if (vid2 == -1)
+        {
+            vid2 = InsertHashTable(v2);
+            // cout << v2 << ": insert ->" << vid2 << endl;
+        }
+
+        vv[vid1][vid2] = w;
+        MAX_edge += 1;
+        if (line % MONITOR == 0)
+        {
+            // MAX_edge already includes the preview count, so it tends toward 2x
+            // the true edge total; the *200 factor keeps the estimate near 100%
+            printf("\tProgress:\t\t%.2f %%%c", (double)(line)/(MAX_edge+1) * 200, 13);
+            fflush(stdout);
+        }
+    }
+    fclose(fin);
+
+    cout << "\tProgress:\t\t100.00 %\r" << endl;
     cout << "\t# of vertex:\t\t" << MAX_vid << endl;
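vvNet::LoadWeights parses the same layout the Save*Weights routines emit: one vertex per line, a key followed by dim space-separated values; keys missing from the hash table are skipped. A minimal Python sketch of writing a file in that layout (path and dimension are hypothetical):

    # Write embeddings in the "key v1 ... v_dim" layout that LoadWeights parses.
    embeddings = {'apple': [0.1, 0.2, 0.3, 0.4]}   # hypothetical 4-dim vectors
    with open('context.txt', 'w') as f:
        for key, vec in embeddings.items():
            f.write(key + ' ' + ' '.join(str(v) for v in vec) + '\n')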
diff --git a/ICE/vvNet.h b/ICE/vvNet.h
index c669d69..3f1e52c 100644
--- a/ICE/vvNet.h
+++ b/ICE/vvNet.h
@@ -43,6 +43,7 @@ class Vertex {
         Vertex() { out_degree=0.0; in_degree=0.0; }
 };
 
+// add flag
 class Context {
     public:
         long vid;
@@ -80,6 +81,8 @@ class vvNet {
         vector< AliasTable > vertex_AT;
         vector< AliasTable > context_AT;
         vector< AliasTable > negative_AT;
+
+        unordered_map< long, unordered_map<long, double> > vv;
 
         // cache
         vector< double > cached_sigmoid;
@@ -100,6 +103,7 @@ class vvNet {
         // Data Process
         void LoadEdgeList(string);
         void LoadItemConceptList(string);
+        void LoadWeights(string, vector< vector<double> > &);
         void LoadDict(unordered_map< string, vector<string> >&);
 
         // Network Process
@@ -115,6 +119,7 @@ class vvNet {
 
         // vertex vector, context vector, vertex, context, dimension, negative samples, alpha
         void UpdateVertex(vector< vector<double> >&, vector< vector<double> >&, long, long, int, int, double);
+        void UpdateContext(vector< vector<double> >&, vector< vector<double> >&, long, long, int, int, double);
 
         // vertex vector, context vector, vertex, context, dimension, negative samples, community walk steps, alpha
         void UpdateCommunity(vector< vector<double> >&, vector< vector<double> >&, long, long, int, int, int, double);