diff --git a/.gitignore b/.gitignore
index 803371c..f46cfe8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .ipynb_checkpoints
 *.pyc
 data/records*.txt
+data/records*.h5
 env/
diff --git a/munch/__init__.py b/munch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/munch/convert_records.py b/munch/convert_records.py
new file mode 100644
index 0000000..087d6a9
--- /dev/null
+++ b/munch/convert_records.py
@@ -0,0 +1,53 @@
+from mutable import Converter
+
+import numpy as np
+import h5py
+import sys
+
+data_dir = sys.argv[1]
+
+c = Converter()
+
+def add_to_set(dset, data, inx_prev):
+    inx_cur = inx_prev + len(data)
+    print(inx_prev, inx_cur)
+    dset[inx_prev:inx_cur] = data
+
+    return inx_cur
+
+def convert_set(path, hf):
+    print()
+    print(path)
+    records = open(path)
+
+    games = records.readlines()
+    moves = sum([(record.count(";") + 1) for record in games])
+    print(len(games))
+    print(moves)
+
+    dset = hf.create_dataset("samples", shape=(moves, 9, 9, 9), dtype=np.uint8)
+
+    cur = 0
+    inx_prev = 0
+    data = []
+    for record in games:
+        cur += 1
+        data.extend(c.convert(record.strip()))
+        if cur % 1000 == 0:
+            inx_prev = add_to_set(dset, data, inx_prev)
+            data = []
+
+    add_to_set(dset, data, inx_prev)
+
+def convert_all():
+    data_sets = [
+        "records-1-train",
+        "records-2-valid",
+        "records-3-test",
+    ]
+
+    for ds in data_sets:
+        with h5py.File("%s%s.h5" % (data_dir, ds), "w") as hf:
+            convert_set("%s%s.txt" % (data_dir, ds), hf)
+
+convert_all()
diff --git a/munch/process_games.sh b/munch/process_games.sh
index 8b5ad0b..6ef8f3a 100755
--- a/munch/process_games.sh
+++ b/munch/process_games.sh
@@ -10,3 +10,5 @@ cat "$DATADIR"/records-all.txt | sort | uniq -u > "$DATADIR"/records-clean.txt
 
 # create train/valid/test data sets
 python split_records.py "$DATADIR"
+
+python convert_records.py "$DATADIR"
diff --git a/requirements.txt b/requirements.txt
index 6271e40..0d279f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 jupyter==1.0.0
 numpy==1.14.1
+h5py==2.7.1
 Keras==2.1.5
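
For reference, the generated `.h5` files can be inspected with `h5py` after the pipeline runs. This is a minimal sketch, not part of the change itself; the `data/records-1-train.h5` path is an assumption based on the file names in `convert_records.py` and the `data/records*.h5` entry added to `.gitignore`.

```python
# Minimal sketch: open one of the HDF5 files produced by convert_records.py
# and look at the "samples" dataset. The path below is assumed, not taken
# from the diff verbatim.
import h5py

with h5py.File("data/records-1-train.h5", "r") as hf:
    samples = hf["samples"]              # shape (moves, 9, 9, 9), dtype uint8
    print(samples.shape, samples.dtype)
    first = samples[0]                   # a single encoded board position
    print(first.sum())
```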