'''
Author: Lemay.ai
License: GPLv3
'''
# This is a dirty hack based on the CWS generation script, one I didn't want to write, and I'm
# sorry.
# The gc causes a segfault when it goes to clean up old models, but the program runs *fine* if
# it reaches the end, because instead of doing GC it just tells the system it's done. The
# solution is therefore to avoid the GC entirely, via the dirty, dirty hack of running every
# model in a new subprocess.
import os, sys
import glob
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Turns off some *annoying* warnings; must be set before TensorFlow loads
import keras
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.special import comb
from sklearn.model_selection import train_test_split
from multiprocessing import Process, Queue
###################
# Functions
###################
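# A minimal, illustrative sketch of the subprocess-isolation pattern described at the
# top of the file, pulled out as a standalone helper (the code below inlines the same
# pattern rather than calling this). Assumes a fork-based start method, since the
# nested worker isn't picklable under spawn.
def run_in_subprocess(fn, *args):
    q = Queue()
    def worker(q):
        q.put(fn(*args))
    p = Process(target=worker, args=(q,))
    p.start()
    result = q.get() # Read before join(), so a large payload can't deadlock the child
    p.join()
    return result
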
# Modified CWS index calculation that spits out the numerator and denominator separately.
# Should be functionally identical to CWS_index below.
def CWS_index_modified(results):
    ntests, ncategories = results.shape
    possible_matches = comb(ncategories*ntests, 2)
    possible_in_col_matches = ncategories*comb(ntests, 2)
    possible_across_col_matches = possible_matches - possible_in_col_matches
    # Count all matches total.
    categories, category_counts = np.unique(results, return_counts=True)
    all_matches = 0
    for ix, cat in enumerate(categories):
        all_matches += comb(category_counts[ix], 2)
    # Count within-column matches.
    in_col_matches = 0
    for i in range(ncategories):
        col = results[:,i]
        cats, cat_counts = np.unique(col, return_counts=True)
        for ix, cat in enumerate(cats):
            in_col_matches += comb(cat_counts[ix], 2)
    in_col_mismatches = possible_in_col_matches - in_col_matches
    # Cross-column matches are then easy.
    cross_col_matches = all_matches - in_col_matches
    cross_col_mismatches = possible_across_col_matches - cross_col_matches
    # And so are the numerator and denominator.
    numerator = cross_col_mismatches / possible_across_col_matches
    denominator = in_col_mismatches / possible_in_col_matches
    return numerator, denominator

# Original CWS calculation, for reference/double-checking.
# NB: the denominator hardcodes 10 categories (languages), so this only agrees with
# CWS_index_modified when results has exactly 10 columns.
def CWS_index(results):
    ncategories = len(results[0])
    ntests = len(results)
    possible_matches = comb(ncategories*ntests, 2)
    per_col_matches = comb(ntests, 2)
    possible_across_col_matches = possible_matches - ncategories*per_col_matches
    lang_counts_by_col = []
    for i in range(ncategories):
        column = results[:,i]
        lang_counts_by_col.append(np.unique(column, return_counts=True))
    lang_counts = np.unique(results, return_counts=True)
    all_col_matches = 0
    for lang_count in lang_counts[1]:
        all_col_matches += comb(lang_count, 2)
    within_col_matches = 0
    for i in lang_counts_by_col:
        for j in range(len(i[0])):
            within_col_matches += comb(i[1][j], 2)
    actual_across_col_matches = all_col_matches - within_col_matches
    numerator = (possible_across_col_matches - actual_across_col_matches)/possible_across_col_matches
    denominator = (10*per_col_matches - within_col_matches)/(10*per_col_matches)
    CWS = numerator/denominator
    return CWS

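# Optional sanity check (not part of the original pipeline): with the default 10
# categories (which CWS_index hardcodes), the numerator/denominator pair from
# CWS_index_modified should reproduce CWS_index, since the modified version only
# factors the same ratio apart.
def check_CWS_equivalence(ntests=30, ncategories=10, seed=0):
    rng = np.random.RandomState(seed)
    results = rng.randint(0, ncategories, size=(ntests, ncategories))
    numerator, denominator = CWS_index_modified(results)
    assert np.isclose(numerator/denominator, CWS_index(results))
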
# Constructs X and Y vectors for a given df containing body, tags, and vector.
def construct_X_Y(df, vecSize=100, verbose=False):
    Y = pd.get_dummies(df['tags']).values
    vectors = df['vector'].values
    X_rows = vectors.shape[0]
    X = np.empty([X_rows,vecSize,300])
    it = range(X_rows)
    if verbose:
        it = tqdm(it, desc='generating X/Y')
    for i in it:
        if len(vectors[i][0]) == 0:
            xs = np.zeros([1,vecSize,300])
        else:
            # (tokens, 300) -> (300, tokens), so pad_sequences pads the token axis,
            # then swap back to (vecSize, 300).
            xd = np.stack(vectors[i][0], axis=0)
            xd = xd.swapaxes(0,1)
            xs = pad_sequences(xd, maxlen=vecSize, dtype='float32')
            xs = np.swapaxes(xs,0,1)
        X[i] = xs
    return X, Y

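# Toy illustration of the expected pickle layout, inferred from the code above (an
# assumption, not documentation): each row's 'vector' cell is a one-element list
# wrapping the list of 300-d token vectors for that document.
def toy_construct_X_Y_demo():
    df = pd.DataFrame({
        'body': ['hello world', 'bonjour'],
        'tags': ['en', 'fr'],
        'vector': [[[np.random.rand(300), np.random.rand(300)]],
                   [[np.random.rand(300)]]],
    })
    X, Y = construct_X_Y(df, vecSize=100)
    assert X.shape == (2, 100, 300) and Y.shape == (2, 2)
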
# Constructs a modified X-vector by concatenating the custom vector generated by the sidecar.
def attach_sidecar(X, df, vecSize=100, verbose=False):
    custom_vectors = df['vector_custom'].values
    assert X.shape[0] == custom_vectors.shape[0]
    rows = X.shape[0]
    X_cust = np.empty([rows,vecSize,vecSize])
    it = range(rows)
    if verbose:
        it = tqdm(it, desc='attaching sidecar')
    for i in it:
        if len(custom_vectors[i][0]) == 0:
            xs = np.zeros([1,vecSize,vecSize])
        else:
            xd = np.stack(custom_vectors[i][0], axis=0)
            xd = np.swapaxes(xd,0,1)
            xs = pad_sequences(xd, maxlen=vecSize, dtype='float32')
            xs = np.swapaxes(xs,0,1)
        X_cust[i] = xs
    # Concatenation magic: stack along the feature axis, taking X from
    # (rows, vecSize, 300) to (rows, vecSize, 300 + vecSize).
    X = np.swapaxes(X, 1, 2)
    X_cust = np.swapaxes(X_cust, 1, 2)
    X = np.hstack((X, X_cust))
    return np.swapaxes(X, 1, 2)

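# Quick shape check with hypothetical data (an illustration, not part of the
# pipeline): the sidecar widens the feature axis from 300 to 300 + vecSize.
def toy_attach_sidecar_demo(vecSize=100):
    df = pd.DataFrame({'vector_custom': [[[np.random.rand(vecSize)]]]})
    X = np.zeros([1, vecSize, 300])
    X_wide = attach_sidecar(X, df, vecSize=vecSize)
    assert X_wide.shape == (1, vecSize, 300 + vecSize)
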
# Does the test logic on a model.
def test_model(model, X, Y, seed=42, verbose=False):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=seed)
    # Sorts test record indexes into per-language buckets.
    # It's important for us to order by language, so the CWS calculation is easier later.
    test_by_lang = {0:[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[], 7:[], 8:[], 9:[]}
    for i in range(len(y_test)):
        for j in range(len(y_test[i])):
            if y_test[i][j] == 1:
                test_by_lang[j].append(i)
    # We *really* need every language to appear at least 30 times, since we take
    # the first 30 below.
    for i in range(10):
        assert len(test_by_lang[i]) >= 30
    # The loop below was convenient to write but it's inefficient. In theory we should
    # really just be calling model.predict on the entire test set at once.
    results = []
    it = range(10)
    if verbose:
        it = tqdm(it, desc='languages')
    for lang in it:
        lang_col = []
        it2 = test_by_lang[lang][:30]
        if verbose:
            it2 = tqdm(it2, desc='inputs')
        for lang_record in it2:
            pred = model.predict(np.array([x_test[lang_record]]))
            lang_col.append(list(pred[0]).index(max(pred[0])))
        results.append(lang_col)
    # Transpose so rows are test records (30) and columns are true languages (10),
    # with each entry the predicted language index.
    results = np.transpose(np.array(results))
    return results

# Iterates over the models in a directory, and returns their CWS scores.
def generate_CWS(model_dir, X, Y, verbose=False):
    CWS_scores = np.zeros((10, 2))
    it = range(10)
    if verbose:
        it = tqdm(it, desc='variants')
    q = Queue()
    for seed in it:
        # Each model is loaded and tested in its own subprocess (see the note at the
        # top of the file); the score comes back over the queue.
        def proc_func(q):
            model_path = os.path.join(model_dir, 'model_{}.h5'.format(seed))
            model = keras.models.load_model(model_path)
            results = test_model(model, X, Y, verbose=verbose, seed=seed)
            q.put(CWS_index_modified(results))
        p = Process(target=proc_func, args=(q,))
        p.start()
        # Read the result before join(), so a large payload can't deadlock the child.
        CWS_scores[seed] = q.get()
        p.join()
    return CWS_scores

###################
# Main
###################
def main():
    # Tuples of (data file name, model file directory, whether or not to concatenate
    # the custom vectors).
    test_data = [
        ('word2vec_vectors_window_6.pkl', 'word2vec_width_90', False), # Word2Vec on its own
        ('word2vec_vectors_window_6.pkl', 'word2vec_with_custom_window_6_width_80', True), # Word2Vec plus sidecar
        ('glove_vectors_window_4.p', 'glove_width_90', False), # GloVe on its own
        ('glove_vectors_window_4.p', 'glove_with_custom_window_4_width_80', True), # GloVe plus sidecar
        ('fastText_vectors_window_5.p', 'fasttext_width_80', False), # fastText on its own
        ('fastText_vectors_window_5.p', 'fasttext_with_custom_window_5_width_70', True) # fastText plus sidecar
    ]
    os.makedirs('CWS_csvs', exist_ok=True) # Make sure the output directory exists
    for data_file, model_dir, has_sidecar in tqdm(test_data, desc='models'):
        # The whole pipeline for each model family also runs in its own subprocess,
        # for the same GC-dodging reason as above.
        def proc_func(data_file, model_dir, has_sidecar):
            print('Reading vector file')
            df = pd.read_pickle(data_file)
            print('Constructing X and Y vectors')
            X, Y = construct_X_Y(df, verbose=True)
            if has_sidecar:
                print('Attaching sidecar')
                X = attach_sidecar(X, df, verbose=True)
            CWS_scores = generate_CWS(model_dir, X, Y, verbose=True)
            np.savetxt(os.path.join('CWS_csvs', model_dir + '_CWS.csv'), CWS_scores,
                       delimiter=',')
        p = Process(target=proc_func, args=(data_file, model_dir, has_sidecar))
        p.start()
        p.join()

if __name__ == '__main__':
    main()