Merge pull request #24 from prasannababuAddagiri/master

wordvecspace interact feature
deep-compute · Jan 20, 2018 · 622859e · 622859e
2 parents d0804ed + 1538b5c
commit 622859e
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -63,19 +63,36 @@ $ wordvecspace convert <input_dir> <output_dir>
 # You can also generate shards by specifying number of vectors per each shard
 $ wordvecspace convert <input_dir> <output_dir> -n 5000
 ```
+### Interactive console
+```bash
+$ wordvecspace interact <input_dir>
+
+# <input_dir> is the directory which has vocab.txt and vectors.npy
+```
+Example:
+```bash
+$ wordvecspace interact /home/user/data
+
+Total number of vectors and dimensions in .npy file (71291, 5)
 
+help
+['DEFAULT_K', 'VECTOR_FNAME', 'VOCAB_FNAME', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_load_vocab', '_make_array', '_perform_dot', '_perform_sgemm', '_perform_sgemv', 'data_dir', 'does_word_exist', 'get_distance', 'get_distances', 'get_nearest_neighbors', 'get_vector_magnitudes', 'get_word_at_index', 'get_word_index', 'get_word_occurrences', 'get_word_vector', 'get_word_vectors', 'load', 'magnitudes', 'num_dimensions', 'num_vectors', 'vectors', 'word_indices', 'word_occurrences', 'words']
+
+WordVecSpace Console
+>>> wv = WordVecSpace
+
+```
 ### Importing
 ```python
 >>> from wordvecspace import WordVecSpace
 ```
 
-### Loading data (Vector and Vocab information)
+#### Loading data (Vector and Vocab information)
 ```python
 >>> wv = WordVecSpace('/path/to/data/')
 >>> wv.load()
 ```
-
-### Check if a word exists or not in the word vector space
+#### Check if a word exists or not in the word vector space
 ```python
 >>> print wv.does_word_exist("india")
 True
@@ -84,7 +101,7 @@ True
 False
 ```
 
-### Get the index of a word
+#### Get the index of a word
 ```python
 >>> print wv.get_word_index("india")
 509
@@ -101,7 +118,7 @@ Traceback (most recent call last):
 wordvecspace.UnknownWord: "inidia"
 ```
 
-### Get vector for given word or index
+#### Get vector for given word or index
 ```python
 # Get the word vector for a word india
 
@@ -134,13 +151,13 @@ wordvecspace.UnknownWord: "inidia"
 [ 0.  0.  0.  0.  0.]
 ```
 
-### Get Word at Index 
+#### Get Word at Index 
 ```python
 # Get word at Index 509
 >>> print wv.get_word_at_index(509)
 india
 ```
-### Get occurrences of the word 
+#### Get occurrences of the word 
 ```python
 # Get occurrences of the word "india"
 >>> print wv.get_word_occurrences("india")
@@ -165,7 +182,7 @@ Traceback (most recent call last):
 wordvecspace.UnknownWord: "inidia"
 ```
 
-### Get Vector magnitude of the word 
+#### Get Vector magnitude of the word 
 ```python
 # Get Vector magnitude of the word "india"
 >>> print wv.get_vector_magnitudes("india")
@@ -184,15 +201,15 @@ wordvecspace.UnknownWord: "inidia"
 [ 0.          7.3621]
 ```	
 
-### Get vectors for list of words
+#### Get vectors for list of words
 ```python
 # Get vectors for list of words ["usa", "india"]
 >>> print wv.get_word_vectors(["usa", "india"])
 [[-0.7216 -0.0557  0.4108  0.5494  0.0741]
  [-0.6259 -0.21    0.5559 -0.3664  0.3478]]
 ```
 
-### Get distance between two words 
+#### Get distance between two words 
 ```python
 # Get distance between "india", "usa"
 >>> print wv.get_distance("india", "usa")
@@ -203,7 +220,7 @@ wordvecspace.UnknownWord: "inidia"
 1.16397565603
 ```
 
-### Get distance between list of words
+#### Get distance between list of words
 ```python
 >>> print wv.get_distances("for", ["to", "for", "india"])
 [[  1.4990e-01]
@@ -227,7 +244,7 @@ wordvecspace.UnknownWord: "inidia"
 [[ 1.3432  0.5781  0.2306 ...,  1.0937  1.1369  0.4284]]
 ```
 
-### Get nearest neighbors 
+#### Get nearest neighbors 
 ```python
 # Get nearest neighbours for given word or index
 >>> print wv.get_nearest_neighbors("india", 20)

diff --git a/setup.py b/setup.py
@@ -1,28 +1,6 @@
 from setuptools import setup, find_packages
-import os
 
-HERE = os.path.abspath(os.path.dirname(__file__))
-def get_long_description():
-    dirs = [ HERE ]
-    if os.getenv("TRAVIS"):
-        dirs.append(os.getenv("TRAVIS_BUILD_DIR"))
-
-    long_description = ""
-
-    for d in dirs:
-        rst_readme = os.path.join(d, "README.rst")
-        if not os.path.exists(rst_readme):
-            continue
-
-        with open(rst_readme) as fp:
-            long_description = fp.read()
-            return long_description
-
-    return long_description
-
-long_description = get_long_description()
-
-version = '0.4.1'
+version = '0.4.2'
 setup(
     name="wordvecspace",
     version=version,
@@ -31,7 +9,6 @@ def get_long_description():
                 " created using Google's Word2vec tool. It also supports"
                 " converting word vector space data (vectors and vocabulary)"
                 " from Google Word2Vec format to WordVecSpace format.",
-    long_description=long_description,
     keywords='wordvecspace',
     author='Deep Compute, LLC',
     author_email="[email protected]",

diff --git a/wordvecspace/command.py b/wordvecspace/command.py
@@ -1,5 +1,8 @@
+import code
+
 from basescript import BaseScript
 from convert import GWVec2WordVecSpace
+from wordvecspace import WordVecSpace
 
 class WordVecSpaceCommand(BaseScript):
     DESC = 'Word Vector Space command-line tool'
@@ -13,6 +16,21 @@ def convert(self):
 
     DEFAULT_NUM_VECS_PER_SHARD = 0
 
+    def interact(self):
+	interact = WordVecSpace(self.args.input_dir)
+        interact.load()
+
+	vectors = interact.num_vectors
+	dimensions = interact.num_dimensions
+
+	print "Total number of vectors and dimensions in .npy file (%s, %s)" %(vectors, dimensions)
+	print ""
+	print "help"
+	print "%s" %(dir(interact))
+
+	namespace=dict(WordVecSpace=interact)
+    	code.interact("WordVecSpace Console", local=namespace)
+
     def define_subcommands(self, subcommands):
         super(WordVecSpaceCommand, self).define_subcommands(subcommands)
 
@@ -21,13 +39,20 @@ def define_subcommands(self, subcommands):
         convert_cmd.set_defaults(func=self.convert)
         convert_cmd.add_argument('input_dir',
             help='Input directory containing Google Word2Vec format files'
-                 ' (vocab.txt, w2v_vectors.bin)')
+                 ' (vocab.txt, vectors.bin)')
         convert_cmd.add_argument('output_dir',
             help='Output directory where WordVecSpace format files are produced')
         convert_cmd.add_argument('-n', '--num-vecs-per-shard',
             default=self.DEFAULT_NUM_VECS_PER_SHARD, type=int,
             help='Number of vectors per shard. 0 value ensures all vecs in one shard.')
 
+        interact_cmd = subcommands.add_parser('interact',
+                help='WordVecSpace Console')
+        interact_cmd.set_defaults(func=self.interact)
+        interact_cmd.add_argument('input_dir',
+                help='Input directory containing WordVecSpace format files'
+                ' (vocab.txt, vectors.npy)')
+
 def main():
     WordVecSpaceCommand().start()