-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkeywordextractionscript.py
1 lines (1 loc) · 14 KB
/
keywordextractionscript.py
1
{"cells":[{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"from __future__ import unicode_literals, print_function, division\nfrom io import open\nimport unicodedata\nimport string\nimport re\nimport random\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\nfrom torch import optim\nimport torch.nn.functional as F\nimport sys\n\n# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session\n\nuse_cuda = True\nTRAIN = True\n\nfor arg in sys.argv:\n if arg == '--train':\n TRAIN = True\n elif arg == '--cuda':\n use_cuda = torch.cuda.is_available()\n\nprint(\"CUDA : \", use_cuda)\nprint(\"TRAIN: \", TRAIN)\n\nSOS_token = 0\nEOS_token = 1\n\nclass Lang:\n\n def __init__(self, name):\n self.name = name\n self.word2index = {}\n self.word2count = {}\n self.index2word = {0: \"SOS\", 1: \"EOS\"}\n self.n_words = 2\n\n def add_sentence(self, sentence):\n for word in sentence.split(' '):\n self.add_word(word)\n\n def add_word(self, word):\n if word not in self.word2index:\n self.word2index[word] = self.n_words\n self.word2count[word] = 1\n self.index2word[self.n_words] = word\n self.n_words += 1\n else:\n self.word2count[word] += 1\n\ndef unicode2ascii(s):\n return ''.join(\n c for c in unicodedata.normalize('NFD', s)\n if unicodedata.category(c) != 'Mn'\n )\n\ndef normalize_string(s):\n s = unicode2ascii(s.lower().strip())\n s = re.sub(r\"([.!?])\", r\" \\1\", s)\n s = re.sub(r\"[^a-z_AZ.!?]+\", r\" \", s)\n\n return s\n\ndef read_langs(lang1,lang2, reverse = False):\n print(\"Reading lines\")\n \n #read the file and split into lines\n lines = open('../input/titles2keywords/%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\\n')\n \n #split every line into pairs and normalize\n pairs = [[normalize_string(s) for s in l.split('\\t')] for l in lines]\n \n #reverse pairs, make Language instances\n if reverse:\n pairs = [list(reversed(p)) for p in pairs]\n input_lang = Lang(lang2)\n output_lang = Lang(lang1)\n else:\n input_lang = Lang(lang1)\n output_lang = Lang(lang2)\n \n return input_lang, output_lang, pairs\n\nMAX_LENGTH = 512\n\ndef filter_pair(p):\n return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH\n\ndef filter_pairs(pairs):\n return [pair for pair in pairs if filter_pair(pair)]\n\ndef prepare_data(lang1, lang2, reverse=False):\n input_lang, output_lang, pairs = read_langs(lang1, lang2, reverse)\n pairs = filter_pairs(pairs)\n\n for pair in pairs:\n input_lang.add_sentence(pair[0])\n output_lang.add_sentence(pair[1])\n\n return input_lang, output_lang, pairs\n\ninput_lang, output_lang, pairs = prepare_data('keyword', 'data', False)\n\nclass EncoderRNN(nn.Module):\n\n def __init__(self, input_size, hidden_size):\n super(EncoderRNN, self).__init__()\n\n self.hidden_size = hidden_size\n self.embedding = nn.Embedding(input_size, hidden_size)\n self.gru = nn.GRU(hidden_size, hidden_size)\n\n def forward(self, input, hidden):\n embedded = self.embedding(input).view(1, 1, -1)\n output = embedded\n output, hidden = self.gru(output, hidden)\n\n return output, hidden\n\n def init_hidden(self):\n result = Variable(torch.zeros(1, 1, self.hidden_size))\n\n if use_cuda:\n return result.cuda()\n else:\n return result\n\nclass DecoderRNN(nn.Module):\n\n def __init__(self, hidden_size, output_size):\n super(DecoderRNN, self).__init__()\n\n self.hidden_size = hidden_size\n self.embedding = nn.Embedding(output_size, hidden_size)\n self.gru = nn.GRU(hidden_size, hidden_size)\n self.out = nn.Linear(hidden_size, output_size)\n self.softmax = nn.LogSoftmax(dim=1)\n\n def forward(self, input, hidden):\n output = self.embedding(input).view(1, 1, -1)\n output = F.relu(output)\n output, hidden = self.gru(output, hidden)\n output = self.softmax(self.out(output[0]))\n\n return output, hidden\n\n def init_hidden(self):\n result = Variable(torch.zeros(1, 1, self.hidden_size))\n\n if use_cuda:\n return result.cuda()\n else:\n return result\n\nclass AttnDecoderRNN(nn.Module):\n\n def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):\n super(AttnDecoderRNN, self).__init__()\n\n self.hidden_size = hidden_size\n self.output_size = output_size\n self.dropout_p = dropout_p\n self.max_length = max_length\n self.embedding = nn.Embedding(self.output_size, self.hidden_size)\n self.attn = nn.Linear(self.hidden_size * 2, self.max_length)\n self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)\n self.dropout = nn.Dropout(self.dropout_p)\n self.gru = nn.GRU(self.hidden_size, self.hidden_size)\n self.out = nn.Linear(self.hidden_size, self.output_size)\n\n def forward(self, input, hidden, encoder_outputs):\n embedded = self.embedding(input).view(1, 1, -1)\n embedded = self.dropout(embedded)\n attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)\n attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))\n output = torch.cat((embedded[0], attn_applied[0]), 1)\n output = self.attn_combine(output).unsqueeze(0)\n output = F.relu(output)\n output, hidden = self.gru(output, hidden)\n output = F.log_softmax(self.out(output[0]), dim=1)\n\n return output, hidden, attn_weights\n\n def init_hidden(self):\n result = Variable(torch.zeros(1, 1, self.hidden_size))\n\n if use_cuda:\n return result.cuda()\n else:\n return result\n\ndef indexes_from_sentence(lang, sentence):\n return [lang.word2index[word] for word in sentence.split(' ')]\n\ndef variable_from_sentence(lang, sentence):\n indexes = indexes_from_sentence(lang, sentence)\n indexes.append(EOS_token)\n\n result = Variable(torch.LongTensor(indexes).view(-1, 1))\n\n if use_cuda:\n return result.cuda()\n else:\n return result\n\ndef variables_from_pair(pair):\n input_variable = variable_from_sentence(input_lang, pair[0])\n target_variable = variable_from_sentence(output_lang, pair[1])\n\n return (input_variable, target_variable)\n\nteacher_forcing_ratio = 0.5\n\ndef train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):\n encoder_hidden = encoder.init_hidden()\n\n encoder_optimizer.zero_grad()\n decoder_optimizer.zero_grad()\n\n input_length = input_variable.size()[0]\n target_length = target_variable.size()[0]\n encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))\n encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs\n loss = 0\n\n for ei in range(input_length):\n encoder_output, encoder_hidden = encoder(input_variable[ei], encoder_hidden)\n encoder_outputs[ei] = encoder_output[0][0]\n\n decoder_input = Variable(torch.LongTensor([[SOS_token]]))\n decoder_input = decoder_input.cuda() if use_cuda else decoder_input\n decoder_hidden = encoder_hidden\n\n use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False\n\n if use_teacher_forcing:\n for di in range(target_length):\n decoder_output, decoder_hidden, decoder_attention = decoder(\n decoder_input, decoder_hidden, encoder_outputs\n )\n\n loss += criterion(decoder_output, target_variable[di])\n decoder_input = target_variable[di]\n else:\n for di in range(target_length):\n decoder_output, decoder_hidden, decoder_attention = decoder(\n decoder_input, decoder_hidden, encoder_outputs\n )\n\n topv, topi = decoder_output.data.topk(1)\n ni = topi[0][0]\n decoder_input = Variable(torch.LongTensor([[ni]]))\n decoder_input = decoder_input.cuda() if use_cuda else decoder_input\n loss += criterion(decoder_output, target_variable[di])\n\n if ni == EOS_token:\n break\n\n loss.backward()\n\n encoder_optimizer.step()\n decoder_optimizer.step()\n\n return loss.data / target_length\n\nimport time\nimport math\n\ndef as_minutes(s):\n m = math.floor(s / 60)\n s -= m * 60\n\n return '%dm %ds' % (m, s)\n\ndef time_since(since, percent):\n now = time.time()\n s = now - since\n es = s / (percent)\n rs = es - s\n\n return '%s (- %s)' % (as_minutes(s), as_minutes(rs))\n\ndef train_interations(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):\n start = time.time()\n print_loss_total = 0\n\n encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)\n decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)\n training_pairs = [variables_from_pair(random.choice(pairs)) for i in range(n_iters)]\n criterion = nn.NLLLoss()\n\n for iter in range(1, n_iters + 1):\n training_pair = training_pairs[iter - 1]\n input_variable = training_pair[0]\n target_variable = training_pair[1]\n\n loss = train(\n input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion\n )\n\n print_loss_total += loss\n\n if iter % print_every == 0:\n print_loss_avg = print_loss_total / print_every\n print_loss_total = 0\n\n print('%s (%d %d%%) %.4f' % (\n time_since(start, iter / n_iters), iter, iter / n_iters * 100, print_loss_avg\n ))\n\nimport numpy as np\n\ndef evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):\n input_variable = variable_from_sentence(input_lang, sentence)\n input_length = input_variable.size()[0]\n encoder_hidden = encoder.init_hidden()\n encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))\n encoder_outputs = encoder_ouputs.cuda() if use_cuda else encoder_ouputs\n\n for ei in range(input_length):\n encoder_output, encoder_hidden = encoder(input_variable[ei], encoder_hidden)\n encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]\n\n decoder_input = Variable(torch.LongTensor([[SOS_token]]))\n decoder_input = decoder_input.cuda() if use_cuda else decoder_input\n decoder_hidden = encoder_hidden\n decoded_words = []\n decoder_attentions = torch.zeros(max_length, max_length)\n\n for di in range(max_length):\n decoder_output, decoder_hidden, decoder_attention = decoder(\n decoder_input, decoder_hidden, encoder_outputs\n )\n\n decoder_attentions[di] = decoder_attention.data\n topv, topi = decoder_output.data.topk(1)\n ni = topi[0][0]\n\n if ni == EOS_token:\n decoded_words.append('<EOS>')\n break\n else:\n decoded_words.append(output_lang.index2word[ni])\n\n decoder_input = Variable(torch.LongTensor([[ni]]))\n decoder_input = decoder_input.cuda() if use_cuda else decoder_input\n\n return decoded_words, decoder_attentions[:di + 1]\n\nhidden_size = 256\n\nif TRAIN is True:\n print(\"TRAINING...\")\n\n encoder1 = EncoderRNN(input_lang.n_words, hidden_size)\n attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1)\n\n if use_cuda:\n encoder1 = encoder1.cuda()\n attn_decoder1 = attn_decoder1.cuda()\n\n train_interations(encoder1, attn_decoder1, 75000, print_every=5000)\n\n torch.save(encoder1, 'encoder.pt')\n torch.save(attn_decoder1, 'decoder.pt')\nelse:\n print(\"LOADING...\")\n\n encoder1 = torch.load('encoder.pt')\n attn_decoder1 = torch.load('decoder.pt')\n\ndef output_evaluation(input_sentence):\n output_words, attentions = evaluate(\n encoder1, attn_decoder1, input_sentence\n )\n\n print(\"input = \", input_sentence)\n print(\"output = \", ' '.join(output_words))\n\nwhile(True):\n try:\n inp = raw_input(\">\")\n output_evaluation(inp)\n except KeyError:\n pass","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4}