heavy code restructuring

wiz21b · Apr 26, 2021 · d2276fb · d2276fb
1 parent 9c9db62
commit d2276fb
Show file tree

Hide file tree

Showing 5 changed files with 320 additions and 224 deletions.
diff --git a/p2_LZ77.py b/p2_LZ77.py
@@ -1,3 +1,8 @@
+import math
+import os.path
+import pickle
+from datetime import datetime
+
 def LZ77_encoder(input_text, SWSIZE):
     """ Return a list of (distance, length, character) tuples.
     """
@@ -64,6 +69,35 @@ def peek(ndx):
     return compressed
 
 
+def compute_compression_rate_for_LZ77(tuples, sliding_window_size, genome):
+    """Estimate the encoded size and compression rate of an LZ77 output.
+
+    Each tuple is costed as one 8-bit character plus two fields of
+    ceil(log2(sliding_window_size)) bits each; the genome is costed at
+    8 bits per symbol.
+
+    Returns a pair (compressed_size_in_bits, compression_rate) where the
+    rate is the uncompressed bit count divided by the compressed one.
+    Raises ZeroDivisionError when `tuples` is empty.
+    """
+    # NOTE(review): assumes both the distance and the length fit in
+    # ceil(log2(window)) bits -- confirm against LZ77_encoder.
+    field_width = math.ceil(math.log2(sliding_window_size))
+    bits_per_tuple = 8 + 2 * field_width
+    compressed_size_in_bits = bits_per_tuple * len(tuples)
+    original_size_in_bits = len(genome) * 8
+    return compressed_size_in_bits, original_size_in_bits / compressed_size_in_bits
+
+
+# The following code is to avoid recompressing the genome
+# each time we run the program.
+
+def lz77_cached_compression(sliding_window_size, genome):
+    """Return the LZ77 tuples for `genome`, reusing an on-disk cache.
+
+    The cache lives in the current directory as
+    ``LZ77Cache<sliding_window_size>.dat`` (a pickle of the tuple list).
+    The file name only encodes the window size, so a cached result is
+    decoded and compared with `genome` before it is trusted; a cache
+    written for another genome is recomputed and overwritten.
+
+    Raises AssertionError if a fresh compression does not decode back to
+    `genome`.
+    """
+    cache_name = f"LZ77Cache{sliding_window_size}.dat"
+
+    tuples = None
+    try:
+        # NOTE(security): unpickling can execute arbitrary code. Only keep
+        # cache files that this program wrote itself in this directory.
+        with open(cache_name, "rb") as fin:
+            tuples = pickle.load(fin)
+    except FileNotFoundError:
+        pass
+    else:
+        # Decoding is the only way to tell whether the cache belongs to
+        # this genome, since the name carries the window size alone.
+        if "".join(LZ77_decoder(tuples)) != genome:
+            print(f"Cache {cache_name} does not match this genome, recomputing")
+            tuples = None
+
+    if tuples is None:
+        print(f"Crunching with LZ77, sliding window {sliding_window_size}")
+        chrono = datetime.now()
+        tuples = LZ77_encoder(genome, sliding_window_size)
+        print(f"Compression took {datetime.now() - chrono}")
+        # Explicit raise instead of `assert`: the round-trip check must
+        # still run when Python is started with -O.
+        if "".join(LZ77_decoder(tuples)) != genome:
+            raise AssertionError("LZ77 compression went wrong")
+        with open(cache_name, "wb") as fout:
+            pickle.dump(tuples, fout)
+
+    return tuples
+
+
 def LZ77_decoder(encoded):
     decoded = []
     for d, l, c in encoded:
@@ -91,17 +125,3 @@ def LZ77_decoder(encoded):
     S = "abracadabrad"
     print(S)
     print(LZ77_encoder(S, 7))
-
-    """ Q10. Encode the genome using the LZ77 algorithm. Give the total
-    length of the encoded genome and the compression rate."""
-
-    import numpy as np
-    genome = np.genfromtxt("genome.txt", dtype='str')
-    genome = "".join(genome)
-
-    print("Compressing full genome")
-    compressed = LZ77_encoder(genome, 1024 // 2)
-    print(f"Compression has {len(compressed)} tuples")
-    print("Decompressing")
-    decompressed = LZ77_decoder(compressed)
-    assert "".join(decompressed) == genome
diff --git a/p2_huffman.py b/p2_huffman.py
@@ -0,0 +1,78 @@
+import heapq
+from io import StringIO
+
+
+class Node:
+    """Node of a Huffman tree.
+
+    A node is either a leaf (no children, an explicit positive `weight`
+    and a `symbol`) or an internal node (both children, weight equal to
+    the sum of the children's weights, no symbol). Nodes are compared by
+    weight only. `code` starts as None and is filled in later for leaves.
+    """
+
+    def __init__(self, left_child=None, right_child=None, weight=None, symbol=None):
+        self.left_child = left_child
+        self.right_child = right_child
+
+        if self.has_both_children():
+            # Internal node: weight is derived, so the caller must not pass one.
+            assert weight is None and symbol is None
+            self.weight = self.left_child.weight + self.right_child.weight
+            self.symbol = None
+        else:
+            # Test for None first so that a missing weight fails with the
+            # assertion message instead of a TypeError from `None > 0`.
+            assert weight is not None and weight > 0 and symbol is not None, \
+                f"Weight={weight}, symbol={symbol}"
+            self.weight = weight
+            self.symbol = symbol
+
+        # A node with exactly one child is not allowed.
+        assert (left_child is None and right_child is None) or self.has_both_children()
+        self.code = None
+
+    def has_both_children(self):
+        """Return True for an internal node (left and right child present)."""
+        return self.left_child is not None and self.right_child is not None
+
+    def __eq__(self, other):
+        # Let Python fall back to its default handling for foreign types
+        # (e.g. `node == None`) instead of failing on `other.weight`.
+        if not isinstance(other, Node):
+            return NotImplemented
+        return self.weight == other.weight
+
+    def __lt__(self, other):
+        if not isinstance(other, Node):
+            return NotImplemented
+        return self.weight < other.weight
+
+    # Defining __eq__ already makes instances unhashable; spell it out so
+    # that nobody expects nodes to work as dict keys or set members.
+    __hash__ = None
+
+
+def build_huffman_tree(symbols_cnts: dict):
+    """Build a Huffman tree from a symbol -> count mapping and return its root.
+
+    One leaf is created per symbol; the two lightest entries are then
+    merged repeatedly until a single node remains. An empty mapping
+    raises IndexError.
+    """
+    # heapq is a min-heap, so the lightest (count, node) pair is always on top.
+    heap = [
+        (count, Node(None, None, count, symbol))
+        for symbol, count in symbols_cnts.items()
+    ]
+    heapq.heapify(heap)
+
+    # Bottom-up construction: merge the two lightest nodes each round.
+    while len(heap) > 1:
+        _, lightest = heapq.heappop(heap)
+        _, second_lightest = heapq.heappop(heap)
+
+        merged = Node(lightest, second_lightest)
+        heapq.heappush(heap, (merged.weight, merged))
+
+    # The only entry left holds the top node of the tree.
+    _, root = heap[0]
+    return root
+
+
+def compute_leaves_codes(node: Node, prefix=""):
+    """Assign a binary code string to every leaf below `node`.
+
+    Going to a left child appends "0" to the code, going to a right child
+    appends "1". Each leaf's `code` attribute is set, and the leaves are
+    returned as a list in left-to-right order. When `node` itself is a
+    leaf, its code is `prefix` unchanged (the empty string by default).
+    """
+    leaves = []
+    pending = [(node, prefix)]
+    while pending:
+        current, code = pending.pop()
+        if current.has_both_children():
+            # Push right first so that the left subtree is handled first.
+            pending.append((current.right_child, code + "1"))
+            pending.append((current.left_child, code + "0"))
+            continue
+
+        assert current.left_child is None and current.right_child is None
+        current.code = code
+        leaves.append(current)
+    return leaves
+
+
+def build_codebooks(top_node):
+    """Return (code_map, decode_map) for the Huffman tree rooted at `top_node`.
+
+    `code_map` maps each symbol to its code and `decode_map` maps each
+    code back to its symbol. Both are filled in order of increasing leaf
+    weight.
+    """
+    # Assign a code to each leaf node
+    leaves = compute_leaves_codes(top_node, "")
+    by_weight = sorted(leaves, key=lambda leaf: leaf.weight)
+
+    # Build maps from/to symbol to/from Huffman codes
+    code_map = {leaf.symbol: leaf.code for leaf in by_weight}
+    decode_map = {leaf.code: leaf.symbol for leaf in by_weight}
+
+    return code_map, decode_map