Skip to content

Commit

Permalink
Merge pull request #24 from prasannababuAddagiri/master
Browse files Browse the repository at this point in the history
wordvecspace interact feature
  • Loading branch information
prasannababuAddagiri authored Jan 20, 2018
2 parents d0804ed + 1538b5c commit 622859e
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 37 deletions.
41 changes: 29 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,36 @@ $ wordvecspace convert <input_dir> <output_dir>
# You can also generate shards by specifying the number of vectors per shard
$ wordvecspace convert <input_dir> <output_dir> -n 5000
```
### Interactive console
```bash
$ wordvecspace interact <input_dir>

# <input_dir> is the directory which has vocab.txt and vectors.npy
```
Example:
```bash
$ wordvecspace interact /home/user/data

Total number of vectors and dimensions in .npy file (71291, 5)

help
['DEFAULT_K', 'VECTOR_FNAME', 'VOCAB_FNAME', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_load_vocab', '_make_array', '_perform_dot', '_perform_sgemm', '_perform_sgemv', 'data_dir', 'does_word_exist', 'get_distance', 'get_distances', 'get_nearest_neighbors', 'get_vector_magnitudes', 'get_word_at_index', 'get_word_index', 'get_word_occurrences', 'get_word_vector', 'get_word_vectors', 'load', 'magnitudes', 'num_dimensions', 'num_vectors', 'vectors', 'word_indices', 'word_occurrences', 'words']

WordVecSpace Console
>>> wv = WordVecSpace

```
### Importing
```python
>>> from wordvecspace import WordVecSpace
```

### Loading data (Vector and Vocab information)
#### Loading data (Vector and Vocab information)
```python
>>> wv = WordVecSpace('/path/to/data/')
>>> wv.load()
```

### Check if a word exists or not in the word vector space
#### Check if a word exists or not in the word vector space
```python
>>> print wv.does_word_exist("india")
True
Expand All @@ -84,7 +101,7 @@ True
False
```

### Get the index of a word
#### Get the index of a word
```python
>>> print wv.get_word_index("india")
509
Expand All @@ -101,7 +118,7 @@ Traceback (most recent call last):
wordvecspace.UnknownWord: "inidia"
```

### Get vector for given word or index
#### Get vector for given word or index
```python
# Get the word vector for a word india

Expand Down Expand Up @@ -134,13 +151,13 @@ wordvecspace.UnknownWord: "inidia"
[ 0. 0. 0. 0. 0.]
```

### Get Word at Index
#### Get Word at Index
```python
# Get word at Index 509
>>> print wv.get_word_at_index(509)
india
```
### Get occurrences of the word
#### Get occurrences of the word
```python
# Get occurrences of the word "india"
>>> print wv.get_word_occurrences("india")
Expand All @@ -165,7 +182,7 @@ Traceback (most recent call last):
wordvecspace.UnknownWord: "inidia"
```

### Get Vector magnitude of the word
#### Get Vector magnitude of the word
```python
# Get Vector magnitude of the word "india"
>>> print wv.get_vector_magnitudes("india")
Expand All @@ -184,15 +201,15 @@ wordvecspace.UnknownWord: "inidia"
[ 0. 7.3621]
```

### Get vectors for list of words
#### Get vectors for list of words
```python
# Get vectors for list of words ["usa", "india"]
>>> print wv.get_word_vectors(["usa", "india"])
[[-0.7216 -0.0557 0.4108 0.5494 0.0741]
[-0.6259 -0.21 0.5559 -0.3664 0.3478]]
```

### Get distance between two words
#### Get distance between two words
```python
# Get distance between "india", "usa"
>>> print wv.get_distance("india", "usa")
Expand All @@ -203,7 +220,7 @@ wordvecspace.UnknownWord: "inidia"
1.16397565603
```

### Get distance between list of words
#### Get distance between list of words
```python
>>> print wv.get_distances("for", ["to", "for", "india"])
[[ 1.4990e-01]
Expand All @@ -227,7 +244,7 @@ wordvecspace.UnknownWord: "inidia"
[[ 1.3432 0.5781 0.2306 ..., 1.0937 1.1369 0.4284]]
```

### Get nearest neighbors
#### Get nearest neighbors
```python
# Get nearest neighbors for given word or index
>>> print wv.get_nearest_neighbors("india", 20)
Expand Down
25 changes: 1 addition & 24 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,6 @@
from setuptools import setup, find_packages
import os

HERE = os.path.abspath(os.path.dirname(__file__))
def get_long_description():
    """Return the text of the first README.rst found, or "" if there is none.

    The directory in the module-level ``HERE`` is searched first. When the
    ``TRAVIS`` environment variable is set, the directory named by
    ``TRAVIS_BUILD_DIR`` is searched as well.
    """
    candidate_dirs = [HERE]
    if os.getenv("TRAVIS"):
        travis_build_dir = os.getenv("TRAVIS_BUILD_DIR")
        # TRAVIS can be set without TRAVIS_BUILD_DIR; os.path.join() raises
        # on None, so only add the directory when it is actually defined.
        if travis_build_dir is not None:
            candidate_dirs.append(travis_build_dir)

    for directory in candidate_dirs:
        rst_readme = os.path.join(directory, "README.rst")
        if not os.path.exists(rst_readme):
            continue

        # The first README.rst found wins; later directories are not read.
        with open(rst_readme) as fp:
            return fp.read()

    return ""

long_description = get_long_description()

version = '0.4.1'
version = '0.4.2'
setup(
name="wordvecspace",
version=version,
Expand All @@ -31,7 +9,6 @@ def get_long_description():
" created using Google's Word2vec tool. It also supports"
" converting word vector space data (vectors and vocabulary)"
" from Google Word2Vec format to WordVecSpace format.",
long_description=long_description,
keywords='wordvecspace',
author='Deep Compute, LLC',
author_email="[email protected]",
Expand Down
27 changes: 26 additions & 1 deletion wordvecspace/command.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import code

from basescript import BaseScript
from convert import GWVec2WordVecSpace
from wordvecspace import WordVecSpace

class WordVecSpaceCommand(BaseScript):
DESC = 'Word Vector Space command-line tool'
Expand All @@ -13,6 +16,21 @@ def convert(self):

DEFAULT_NUM_VECS_PER_SHARD = 0

def interact(self):
    """Load the word vector space in ``self.args.input_dir`` and open a console.

    Prints the number of vectors and dimensions, then the attribute names
    of the loaded object, and finally starts an interactive interpreter in
    which the loaded instance is bound to the name ``WordVecSpace``.
    """
    space = WordVecSpace(self.args.input_dir)
    space.load()

    summary = "Total number of vectors and dimensions in .npy file (%s, %s)" % (
        space.num_vectors, space.num_dimensions)

    # Single-argument parenthesised prints emit the same text as the
    # Python 2 print statement.
    print(summary)
    print("")
    print("help")
    print("%s" % (dir(space),))

    # The loaded instance (not the class) is what the console sees as
    # ``WordVecSpace``.
    code.interact("WordVecSpace Console", local={"WordVecSpace": space})

def define_subcommands(self, subcommands):
super(WordVecSpaceCommand, self).define_subcommands(subcommands)

Expand All @@ -21,13 +39,20 @@ def define_subcommands(self, subcommands):
convert_cmd.set_defaults(func=self.convert)
convert_cmd.add_argument('input_dir',
help='Input directory containing Google Word2Vec format files'
' (vocab.txt, w2v_vectors.bin)')
' (vocab.txt, vectors.bin)')
convert_cmd.add_argument('output_dir',
help='Output directory where WordVecSpace format files are produced')
convert_cmd.add_argument('-n', '--num-vecs-per-shard',
default=self.DEFAULT_NUM_VECS_PER_SHARD, type=int,
help='Number of vectors per shard. 0 value ensures all vecs in one shard.')

interact_cmd = subcommands.add_parser('interact',
help='WordVecSpace Console')
interact_cmd.set_defaults(func=self.interact)
interact_cmd.add_argument('input_dir',
help='Input directory containing WordVecSpace format files'
' (vocab.txt, vectors.npy)')

def main():
WordVecSpaceCommand().start()

Expand Down

0 comments on commit 622859e

Please sign in to comment.