# data.py

import os
import re
import collections
import copy
import pickle

import numpy as np
import sklearn.datasets
from scipy.sparse import csr_matrix, vstack


# Names of the datasets this module knows how to load.
AVAILABLE_DATASETS = [
    "20 Newsgroups",
    "RT Polarity",
    "RCV1",
    "RCV1-Vectors-Original",
    "RCV1-Vectors-Custom",
]

# Per-dataset defaults, looked up by the position of the dataset name in AVAILABLE_DATASETS.
DEFAULT_VOCAB_SIZES = [
    10000,
    5000,
    10000,
    None,
    None,
]
DEFAULT_SEQ_LENS = [
    1000,
    56,
    1000,
    None,
    None,
]


class TextDataset(object):
    """
    Base class with the preprocessing steps shared by the text datasets.

    Subclasses populate ``self.documents`` (a sequence of strings) and ``self.labels`` (an array that can be
    indexed with an integer index array). The methods below create further attributes as they are called:
    ``count_vectorizer``, ``data_count`` and ``vocab`` (count_vectorize), ``data_tfidf`` (tfidf_normalize),
    ``data_word2ind`` (generate_word2ind) and ``data`` (generate_out).
    """

    # (pattern, replacement) pairs applied in this exact order by clean_text.
    _CLEANING_RULES = tuple((re.compile(pattern), replacement) for pattern, replacement in (
        (r"[^A-Za-z0-9(),!?'$]", " "),
        (r"(\d+)", " NUM "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r",", " , "),
        (r"!", " ! "),
        (r"\?", " ? "),
        (r"\$", " dollar "),
        (r"\s{2,}", " "),
    ))

    def clean_text(self):
        """
        Tokenization & string cleaning.

        Every document is rewritten in place: the cleaning rules are applied in order, then the result is
        stripped and lower-cased.
        """
        docs = self.documents
        if isinstance(docs, np.ndarray) and docs.dtype != object:
            # Fixed-width string arrays silently truncate assignments longer than their item size, and
            # cleaning can lengthen a string ("," -> " , "), so switch to an object array first.
            docs = docs.astype(object)
        for i, string in enumerate(docs):
            for pattern, replacement in self._CLEANING_RULES:
                string = pattern.sub(replacement, string)
            docs[i] = string.strip().lower()
        self.documents = docs

    def keep_documents(self, idx):
        """
        Keep the documents given by the index, discard the others.

        idx holds integer positions; a scalar is treated as a one-element index. ``documents`` becomes a
        list, ``labels`` is re-indexed and, if it already exists, so are the rows of ``data_count``.
        """
        idx = np.atleast_1d(np.asarray(idx))
        if idx.size == 0:
            idx = idx.astype(np.intp)  # an empty list converts to float64, which is not a valid index dtype
        self.documents = [self.documents[i] for i in idx]
        self.labels = self.labels[idx]
        if hasattr(self, "data_count"):
            self.data_count = self.data_count[idx, :]

    def keep_words(self, idx):
        """
        Keep the words given by the index, discard the others.

        idx holds integer positions into ``vocab`` / the columns of ``data_count``.
        """
        idx = np.atleast_1d(np.asarray(idx))
        if idx.size == 0:
            idx = idx.astype(np.intp)
        self.vocab = [self.vocab[i] for i in idx]
        self.data_count = self.data_count[:, idx]

    def remove_short_documents(self, nwords, vocab="selected"):
        """
        Remove documents that contain less than nwords.

        vocab="selected" counts words through the row sums of ``data_count``; vocab="full" counts the
        whitespace-separated tokens of each document. Any other value raises ValueError.
        """
        if vocab == "selected":
            # Word count with selected vocabulary
            wc = np.asarray(self.data_count.sum(axis=1)).ravel()
        elif vocab == "full":
            # Word count with full vocabulary
            wc = np.array([len(doc.split()) for doc in self.documents], dtype=int)
        else:
            raise ValueError("vocab must be 'selected' or 'full', got {!r}".format(vocab))
        # flatnonzero always yields a 1-D index, even when exactly one document qualifies
        self.keep_documents(np.flatnonzero(wc >= nwords))

    def keep_top_words(self, N):
        """
        Keep only the N words that appear most often.
        """
        freq = np.asarray(self.data_count.sum(axis=0)).ravel()
        idx = np.argsort(freq)[::-1][:N]
        self.keep_words(idx)

    def count_vectorize(self, **params):
        """
        Vectorize the documents in the dataset using CountVectorizer(**params).

        Stores the fitted vectorizer in ``count_vectorizer``, the count matrix in ``data_count`` and the
        feature names (as a list) in ``vocab``.
        """
        # Imported here because the module itself only imports sklearn.datasets.
        from sklearn.feature_extraction.text import CountVectorizer

        self.count_vectorizer = CountVectorizer(**params)
        self.data_count = self.count_vectorizer.fit_transform(self.documents)
        try:
            names = self.count_vectorizer.get_feature_names_out()
        except AttributeError:
            # scikit-learn releases that predate get_feature_names_out
            names = self.count_vectorizer.get_feature_names()
        self.vocab = list(names)  # a list, because generate_word2ind appends "<UNK>" to it
        assert len(self.vocab) == self.data_count.shape[1]

    def tfidf_normalize(self, **params):
        """
        Transform data_count to tf-idf and store in data_tfidf. Do this at the very end.

        params are forwarded to TfidfTransformer; ``norm`` defaults to "l1" when not given.
        """
        from sklearn.feature_extraction.text import TfidfTransformer

        params.setdefault("norm", "l1")
        transformer = TfidfTransformer(**params)
        self.data_tfidf = transformer.fit_transform(self.data_count)

    def generate_word2ind(self, maxlen=None, padding="post", truncating="post"):
        """
        Transforms documents to list of self.vocab indexes of the same length (i.e. maxlen). Do this at the
        very end.

        maxlen defaults to the longest sequence. padding / truncating are "pre" or "post" (ValueError
        otherwise). The result is stored in ``data_word2ind`` as an int64 array of shape
        (number of documents, maxlen), filled with the "<UNK>" index wherever a document has no token.
        """
        if padding not in ("pre", "post"):
            raise ValueError("padding must be 'pre' or 'post', got {!r}".format(padding))
        if truncating not in ("pre", "post"):
            raise ValueError("truncating must be 'pre' or 'post', got {!r}".format(truncating))

        # Add "<UNK>" to vocabulary (for padding) and create a reverse vocabulary lookup
        if len(self.vocab) == 0 or self.vocab[-1] != "<UNK>":
            self.vocab = list(self.vocab) + ["<UNK>"]
        reverse_vocab = {w: i for i, w in enumerate(self.vocab)}
        pad_index = len(self.vocab) - 1

        # Tokenize all the documents using the CountVectorizer's analyzer and map words to indexes.
        # Plain lists are used because the sequences have different lengths.
        analyzer = self.count_vectorizer.build_analyzer()
        sequences = [[reverse_vocab[w] for w in analyzer(doc) if w in reverse_vocab]
                     for doc in self.documents]

        # Truncate or pad sequences to match maxlen (adapted from tflearn.data_utils.pad_sequences)
        if maxlen is None:
            maxlen = max((len(s) for s in sequences), default=0)

        x = np.full((len(sequences), maxlen), pad_index, dtype=np.int64)
        for row, s in enumerate(sequences):
            if truncating == "pre":
                trunc = s[max(len(s) - maxlen, 0):]  # keep the last maxlen items
            else:
                trunc = s[:maxlen]
            if not trunc:
                continue  # nothing to write, the row stays all padding

            if padding == "post":
                x[row, :len(trunc)] = trunc
            else:
                x[row, -len(trunc):] = trunc

        self.data_word2ind = x

    def generate_out(self, out, **params):
        """
        Select the final representation and store it in ``self.data``.

        out is "count", "tfidf" or "word2ind"; params are forwarded to tfidf_normalize or
        generate_word2ind. Any other value raises ValueError.
        """
        if out == "count":
            self.data = self.data_count
        elif out == "tfidf":
            self.tfidf_normalize(**params)  # transform count matrix into a tf-idf matrix
            self.data = self.data_tfidf
        elif out == "word2ind":
            self.generate_word2ind(**params)  # transform documents to sequences of vocab indexes
            self.data = self.data_word2ind
        else:
            raise ValueError("out must be 'count', 'tfidf' or 'word2ind', got {!r}".format(out))


class Text20News(TextDataset):
    """
    20 Newsgroups dataset.
    Dataset retrieved from scikit-learn (http://scikit-learn.org/stable/datasets/twenty_newsgroups.html)
    """

    def __init__(self, subset, remove=("headers", "footers", "quotes"), categories=None,
                 shuffle=True, random_state=42):
        """
        Fetch the requested subset through sklearn.datasets.fetch_20newsgroups (all arguments are forwarded)
        and store its documents, integer labels and class names.
        """
        dataset = sklearn.datasets.fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle,
                                                      random_state=random_state, remove=remove)
        self.documents = dataset.data
        self.labels = dataset.target
        self.class_names = dataset.target_names
        assert max(self.labels) + 1 == len(self.class_names)

    def preprocess_train(self, out, vocab_size=10000, **params):
        """
        Preprocess the training subset and build the vocabulary.

        out and params are passed to generate_out; vocab_size is the number of most frequent words kept.
        The vocabulary size before pruning is recorded in ``orig_vocab_size``.
        """
        self.remove_short_documents(nwords=20, vocab="full")  # remove documents < 20 words in length
        self.clean_text()  # tokenize & clean text
        self.count_vectorize(stop_words="english")  # create term-document count matrix and vocabulary
        self.orig_vocab_size = len(self.vocab)
        self._remove_encoded_images()  # remove encoded images
        self.keep_top_words(vocab_size)  # keep only the top vocab_size words
        self.remove_short_documents(nwords=5, vocab="selected")  # remove docs whose signal would be the zero vector
        self.generate_out(out, **params)  # generate final self.data

    def preprocess_test(self, train_vocab, out, **params):
        """
        Preprocess the test subset, counting only the words of train_vocab.

        train_vocab is handed to CountVectorizer as its fixed vocabulary; out and params are passed to
        generate_out.
        """
        self.clean_text()
        self.count_vectorize(vocabulary=train_vocab)
        self.remove_short_documents(nwords=5, vocab="selected")
        self.generate_out(out, **params)

    def _remove_encoded_images(self, freq=1e3):
        """
        Drop every document in which the token "ax" occurs at least freq times.

        Does nothing when "ax" is not part of the vocabulary (there is then no such document to drop).
        """
        try:
            widx = self.vocab.index("ax")
        except ValueError:
            return
        # ravel/flatnonzero keep everything 1-D even for a single document or a single survivor
        wc = self.data_count[:, widx].toarray().ravel()
        self.keep_documents(np.flatnonzero(wc < freq))


class TextRTPolarity(TextDataset):
    """
    Pang and Lee's movie review sentiment polarity dataset.
    http://www.cs.cornell.edu/people/pabo/movie-review-data/
    """

    def __init__(self, shuffle=True, random_state=10, data_dir="data/RT Polarity"):
        """
        Load the positive and negative examples from data_dir and optionally shuffle them.

        data_dir must contain "rt-polarity.pos" and "rt-polarity.neg" (UTF-8, one example per line).
        Positive examples get label 0 and negative examples label 1, matching ``class_names``.
        Note that shuffling seeds NumPy's global random state with random_state.
        """
        # Load data from files
        positive_examples = self._read_examples(os.path.join(data_dir, "rt-polarity.pos"))
        negative_examples = self._read_examples(os.path.join(data_dir, "rt-polarity.neg"))

        # Save documents. dtype=object keeps full Python strings: a fixed-width string array would
        # silently truncate documents that grow when they are cleaned in place later on.
        self.documents = np.array(positive_examples + negative_examples, dtype=object)

        # Save target labels
        positive_labels = [0 for _ in positive_examples]
        negative_labels = [1 for _ in negative_examples]
        self.labels = np.array(positive_labels + negative_labels)

        # Save class names
        self.class_names = ["pos", "neg"]

        # Shuffle data
        if shuffle:
            np.random.seed(random_state)
            shuffle_indices = np.random.permutation(np.arange(len(self.labels)))
            self.documents = self.documents[shuffle_indices]
            self.labels = self.labels[shuffle_indices]

    @staticmethod
    def _read_examples(path):
        """Return the stripped lines of the UTF-8 text file at path, closing the file afterwards."""
        with open(path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    def preprocess(self, out, vocab_size=5000, **params):
        """
        Clean and vectorize the documents, keep the vocab_size most frequent words and build ``self.data``.

        out and params are passed to generate_out. The vocabulary size before pruning is recorded in
        ``orig_vocab_size``.
        """
        self.clean_text()  # tokenize & clean text
        self.count_vectorize()  # create term-document count matrix and vocabulary
        self.orig_vocab_size = len(self.vocab)
        self.keep_top_words(vocab_size)  # keep only the top vocab_size words
        self.generate_out(out, **params)  # generate final self.data


class TextRCV1(TextDataset):
    """
    Reuters RCV1 dataset.
    Paper: http://www.jmlr.org/papers/volume5/lewis04a/lewis04a.pdf
    """

    def __init__(self):
        """Load documents, the label matrix and the class names from the local pickles (see _load)."""
        self.documents, self.labels, self.class_names = self._load()

        assert len(self.class_names) == 103  # 103 categories according to LYRL2004
        N, C = self.labels.shape
        assert C == len(self.class_names)

    def preprocess(self, out, vocab_size=2000, **params):
        """
        Restrict the dataset to single-label documents of the selected classes and build ``self.data``.

        After this call ``labels`` is a 1-D array of class indexes into ``class_names``. out and params are
        passed to generate_out; vocab_size is the number of most frequent words kept.
        """
        # Selection of classes
        keep = ['C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C21', 'C22', 'C23', 'C24',
                'C31', 'C32', 'C33', 'C34', 'C41', 'C42', 'E11', 'E12', 'E13', 'E14', 'E21', 'E31',
                'E41', 'E51', 'E61', 'E71', 'G15', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT', 'GENV',
                'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL', 'GPRO', 'GREL', 'GSCI',
                'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA', 'GWELF', 'M11', 'M12', 'M13', 'M14']
        assert len(keep) == 55  # 55 second-level categories according to LYRL2004
        keep.remove('C15')  # 130,426 documents after removing multiple class documents
        keep.remove('GOBIT')  # 5 documents after removing multiple class documents
        keep.remove('GMIL')  # 1 document after removing multiple class documents
        self._keep_classes(keep)

        # Remove documents with multiple classes
        classes_per_doc = np.asarray(self.labels.sum(axis=1)).ravel()
        single_class = classes_per_doc == 1
        self.labels = self.labels[single_class]
        self.documents = [doc for doc, is_single in zip(self.documents, single_class) if is_single]

        # Convert target from one-hot sparse matrix to labels
        N, C = self.labels.shape
        labels = self.labels.tocoo()
        self.labels = labels.col
        assert self.labels.min() == 0
        assert self.labels.max() == C - 1

        self.clean_text()  # tokenize & clean text
        self.count_vectorize(stop_words="english")  # create term-document count matrix and vocabulary
        self.orig_vocab_size = len(self.vocab)
        self.keep_top_words(vocab_size)  # keep only the top vocab_size words
        self.remove_short_documents(nwords=5, vocab="selected")  # remove docs whose signal would be the zero vector
        self.generate_out(out, **params)  # generate final self.data

    def _load(self):
        """
        Read the pickles stored under ./data/RCV1/pickles/RCV1-v2_Sparse.

        Files whose names start with "documents" / "labels" are loaded in sorted order and concatenated;
        the label chunks are stacked vertically. Returns (documents, labels, class_names).

        NOTE: unpickling can execute arbitrary code - only use pickle files from a trusted source.
        """
        data_dir = os.path.abspath(os.path.join(os.path.curdir, "data", "RCV1", "pickles", "RCV1-v2_Sparse"))

        def load_pickle(filename):
            # Context manager so that the file handle is closed right after loading
            with open(os.path.join(data_dir, filename), "rb") as f:
                return pickle.load(f)

        class_names = load_pickle("class_names.pkl")

        pkl_files = os.listdir(data_dir)
        docs_pkls = sorted(name for name in pkl_files if name.startswith("documents"))
        labels_pkls = sorted(name for name in pkl_files if name.startswith("labels"))

        documents = []
        for docs_pkl in docs_pkls:
            documents += load_pickle(docs_pkl)

        # presumably every labels pickle holds a list of sparse matrices - TODO confirm
        _labels = []
        for labels_pkl in labels_pkls:
            _labels += load_pickle(labels_pkl)
        labels = vstack(_labels)

        return documents, labels, class_names

    def _keep_classes(self, keep):
        """
        Reduce ``labels`` to the columns of the classes named in keep (in that order) and set
        ``class_names`` to keep.
        """
        # Construct a lookup table for labels to keep
        class_lookup = {name: i for i, name in enumerate(self.class_names)}
        self.class_names = keep

        # Get indices of classes to keep & delete everything else.
        # The index array must have an integer dtype to be usable for column selection.
        idx_keep = np.array([class_lookup[cat] for cat in keep], dtype=int)
        self.labels = self.labels[:, idx_keep]
        assert self.labels.shape[1] == len(keep)


class TextRCV1_Vectors(TextDataset):
    """
    Reuters RCV1 dataset vectors.
    Paper: http://www.jmlr.org/papers/volume5/lewis04a/lewis04a.pdf
    Dataset retrieved from scikit-learn (http://scikit-learn.org/stable/datasets/rcv1.html)

    Note: Dataset contains only cosine-normalized, log tf-idf vectors (i.e. can only be used for baseline
    models & MLP).
    """

    def __init__(self, subset, shuffle=True, random_state=42):
        """
        Fetch the requested subset through sklearn.datasets.fetch_rcv1.

        shuffle is ignored (forced to False) for subset="all". ``vocab`` is only a placeholder array with
        one entry per feature; ``orig_vocab_size`` is the number of features.
        """
        if subset == "all":
            shuffle = False  # chronological split violated if shuffled

        dataset = sklearn.datasets.fetch_rcv1(subset=subset, shuffle=shuffle, random_state=random_state)
        self.data = dataset.data
        self.labels = dataset.target
        self.class_names = dataset.target_names

        assert len(self.class_names) == 103  # 103 categories according to LYRL2004
        N, C = self.labels.shape
        assert C == len(self.class_names)

        N, V = self.data.shape
        self.vocab = np.zeros(V)  # hacky workaround to create placeholder value
        self.orig_vocab_size = V

    def preprocess(self, out, **params):
        """
        Restrict the dataset to single-label documents of the selected classes.

        After this call ``labels`` is a 1-D array of class indexes into ``class_names``. out and params are
        accepted for signature compatibility with the other datasets and are not used here.
        """
        # Selection of classes
        keep = ['C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C21', 'C22', 'C23', 'C24',
                'C31', 'C32', 'C33', 'C34', 'C41', 'C42', 'E11', 'E12', 'E13', 'E14', 'E21', 'E31',
                'E41', 'E51', 'E61', 'E71', 'G15', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT', 'GENV',
                'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL', 'GPRO', 'GREL', 'GSCI',
                'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA', 'GWELF', 'M11', 'M12', 'M13', 'M14']
        assert len(keep) == 55  # 55 second-level categories according to LYRL2004
        keep.remove('C15')  # 130,426 documents after removing multiple class documents
        keep.remove('GOBIT')  # 5 documents after removing multiple class documents
        keep.remove('GMIL')  # 1 document after removing multiple class documents
        self._keep_classes(keep)

        # Remove documents with multiple classes
        classes_per_doc = np.asarray(self.labels.sum(axis=1)).ravel()
        single_class = classes_per_doc == 1
        self.labels = self.labels[single_class]
        self.data = self.data[single_class, :]

        # Convert target from one-hot sparse matrix to labels
        N, C = self.labels.shape
        labels = self.labels.tocoo()
        self.labels = labels.col
        assert self.labels.min() == 0
        assert self.labels.max() == C - 1

    def _keep_classes(self, keep):
        """
        Reduce ``labels`` to the columns of the classes named in keep (in that order) and set
        ``class_names`` to keep.
        """
        # Construct a lookup table for labels to keep
        class_lookup = {name: i for i, name in enumerate(self.class_names)}
        self.class_names = keep

        # Get indices of classes to keep & delete everything else.
        # The index array must have an integer dtype to be usable for column selection.
        idx_keep = np.array([class_lookup[cat] for cat in keep], dtype=int)
        self.labels = self.labels[:, idx_keep]
        assert self.labels.shape[1] == len(keep)


def load_dataset(dataset, out, vocab_size=None, **params):
    """
    Returns the train & test datasets for a chosen dataset. The datasets are directly loaded from stored
    pickles (if available) or loaded from disk and preprocessed.

    vocab_size=None selects the dataset's entry in DEFAULT_VOCAB_SIZES. For out="word2ind" a missing or
    None ``maxlen`` keyword is replaced by the dataset's entry in DEFAULT_SEQ_LENS. The cache lives under
    ./data/pickled_datasets/<dataset>/<vocab_size>/<out>[/<maxlen>]/ as train.pkl and test.pkl; freshly
    prepared datasets are written there.

    NOTE: unpickling can execute arbitrary code - the cache directory must only contain trusted files.
    """
    if vocab_size is None:
        vocab_size = DEFAULT_VOCAB_SIZES[AVAILABLE_DATASETS.index(dataset)]

    # .get(): callers may omit maxlen altogether
    if out == "word2ind" and params.get("maxlen") is None:
        params["maxlen"] = DEFAULT_SEQ_LENS[AVAILABLE_DATASETS.index(dataset)]

    pickle_dir = os.path.abspath(os.path.join(os.path.curdir, "data", "pickled_datasets", dataset,
                                              "{}".format(vocab_size), out))
    if out == "word2ind":
        pickle_dir = os.path.join(pickle_dir, "{}".format(params["maxlen"]))

    train_file = os.path.join(pickle_dir, "train.pkl")
    test_file = os.path.join(pickle_dir, "test.pkl")

    if os.path.exists(train_file) and os.path.exists(test_file):
        with open(train_file, "rb") as f:
            train = pickle.load(f)
        with open(test_file, "rb") as f:
            test = pickle.load(f)
        print("Loaded dataset from pickles.")
        return train, test

    train, test = prepare_dataset(dataset, out, vocab_size, **params)
    print("Dataset prepared.")

    os.makedirs(pickle_dir, exist_ok=True)
    with open(train_file, "wb") as f:
        pickle.dump(train, f)
    with open(test_file, "wb") as f:
        pickle.dump(test, f)
    print("Dataset pickled.")

    return train, test


def _split_train_test(all_data, split_index, attributes):
    """
    Deep-copy all_data twice and give each copy its part of the named attributes: the train copy gets
    everything before split_index, the test copy everything from split_index on.
    """
    train = copy.deepcopy(all_data)
    test = copy.deepcopy(all_data)
    for name in attributes:
        values = getattr(all_data, name)
        setattr(train, name, values[:split_index])
        setattr(test, name, values[split_index:])
    return train, test


def prepare_dataset(dataset, out, vocab_size, **params):
    """
    Prepares the chosen dataset by loading it from disk, applying all the necessary preprocessing and
    splitting it into disjoint train/test datasets.

    dataset is one of "20 Newsgroups", "RT Polarity", "RCV1", "RCV1-Vectors-Original" or
    "RCV1-Vectors-Custom"; any other name raises ValueError. The two RCV1-Vectors variants require
    out == "tfidf" and vocab_size None. params are forwarded to the dataset's preprocessing method.
    Returns (train, test).
    """
    if dataset == "20 Newsgroups":
        print("Preparing training data...")
        train = Text20News(subset="train")
        train.preprocess_train(out=out, vocab_size=vocab_size, **params)

        print("Preparing test data...")
        test = Text20News(subset="test")
        test.preprocess_test(train_vocab=train.vocab, out=out, **params)
    elif dataset == "RT Polarity":
        print("Preparing data...")
        all_data = TextRTPolarity()
        all_data.preprocess(out=out, vocab_size=vocab_size, **params)

        # Split train/test set: the last 10% of the dataset is the test set. The split point is an
        # absolute position, so a test size of 0 leaves everything in train (a negative index of -0
        # would instead put everything into test).
        num_docs = all_data.data.shape[0]
        split_index = num_docs - int(0.1 * float(num_docs))
        train, test = _split_train_test(all_data, split_index, ("documents", "data", "labels"))
    elif dataset == "RCV1":
        print("Preparing data...")
        all_data = TextRCV1()
        all_data.preprocess(out=out, vocab_size=vocab_size, **params)

        # Split train/test set
        split_index = all_data.data.shape[0] // 2  # according to Bruna's paper & Hinton's dropout paper
        train, test = _split_train_test(all_data, split_index, ("documents", "data", "labels"))
    elif dataset == "RCV1-Vectors-Original":
        assert out == "tfidf"
        assert vocab_size is None

        print("Preparing training data...")
        train = TextRCV1_Vectors(subset="train")
        train.preprocess(out="tfidf", **params)

        print("Preparing test data...")
        test = TextRCV1_Vectors(subset="test")
        test.preprocess(out="tfidf", **params)
    elif dataset == "RCV1-Vectors-Custom":
        assert out == "tfidf"
        assert vocab_size is None

        print("Preparing data...")
        all_data = TextRCV1_Vectors(subset="all")
        all_data.preprocess(out="tfidf", **params)

        # Split train/test set (this dataset carries no documents attribute)
        split_index = all_data.data.shape[0] // 2  # according to Bruna's paper & Hinton's dropout paper
        train, test = _split_train_test(all_data, split_index, ("data", "labels"))
    else:
        raise ValueError("Unknown dataset: {!r}".format(dataset))

    return train, test


def load_word2vec(filepath, vocabulary, embedding_dim):
    """
    Returns the embedding matrix for vocabulary from filepath.

    filepath is a binary word2vec file: a text header "<vocab size> <embedding size>" followed, for each
    word, by the word, a single space and the float32 vector. vocabulary maps words to row indexes.
    The result has shape (len(vocabulary), embedding_dim); rows of words missing from the file keep their
    random initialization. Raises EOFError when the file ends before the announced number of entries.
    """
    # Initialize embedding matrix from pre-trained word2vec embeddings. 0.25 is chosen so that unknown
    # vectors have (approximately) the same variance as pre-trained ones.
    embeddings = np.random.uniform(-0.25, 0.25, (len(vocabulary), embedding_dim))

    words_found = 0
    with open(filepath, "rb") as f:
        header = f.readline()
        word2vec_vocab_size, embedding_size = map(int, header.split())
        binary_len = np.dtype("float32").itemsize * embedding_size
        for _ in range(word2vec_vocab_size):
            # Collect the raw bytes of the word up to the separating space; newlines left over from the
            # previous entry are skipped.
            raw_word = bytearray()
            while True:
                ch = f.read(1)
                if not ch:
                    # Without this check a truncated file would loop forever on empty reads
                    raise EOFError("Unexpected end of file while reading a word from {}".format(filepath))
                if ch == b" ":
                    break
                if ch != b"\n":
                    raw_word += ch
            # Decode the whole word at once so that multi-byte UTF-8 characters survive
            word = raw_word.decode("utf-8", errors="ignore")

            raw_vector = f.read(binary_len)
            if len(raw_vector) != binary_len:
                raise EOFError("Unexpected end of file while reading a vector from {}".format(filepath))

            idx = vocabulary.get(word, None)
            if idx is not None:
                embeddings[idx] = np.frombuffer(raw_vector, dtype="float32")
                words_found += 1

    print("Word Embeddings Extracted: {}".format(words_found))
    print("Word Embeddings Randomly Initialized: {}".format(len(vocabulary) - words_found))

    return embeddings


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    Yields int(num_epochs * len(data) / batch_size) batches of exactly batch_size items each. Items are
    drawn from a queue of indexes that is topped up with a full pass over the data (a random permutation
    if shuffle, else in order) whenever it holds fewer than batch_size indexes, so a batch may span two
    passes.
    """
    data = np.array(data)
    data_size = len(data)
    indices = collections.deque()
    num_iterations = int(num_epochs * data_size / batch_size)
    for _ in range(num_iterations):
        # "while" rather than "if": a single pass is not enough when batch_size > data_size
        while len(indices) < batch_size:
            if shuffle:
                indices.extend(np.random.permutation(data_size))
            else:
                indices.extend(np.arange(data_size))
        idx = [indices.popleft() for _ in range(batch_size)]
        yield data[idx]