Skip to content

Commit

Permalink
heavy code restructuring
Browse files Browse the repository at this point in the history
  • Loading branch information
Stéphane Champailler committed Apr 26, 2021
1 parent 9c9db62 commit d2276fb
Show file tree
Hide file tree
Showing 5 changed files with 320 additions and 224 deletions.
48 changes: 34 additions & 14 deletions p2_LZ77.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
import math
import os.path
import pickle
from datetime import datetime

def LZ77_encoder(input_text, SWSIZE):
""" Return a list of (distance, length, character) tuples.
"""
Expand Down Expand Up @@ -64,6 +69,35 @@ def peek(ndx):
return compressed


def compute_compression_rate_for_LZ77(tuples, sliding_window_size, genome):
    """Estimate the encoded size of an LZ77 output and its compression rate.

    Every tuple is costed as one 8-bit character plus two fields (distance
    and length) of ``ceil(log2(sliding_window_size))`` bits each.  The
    uncompressed genome is costed at 8 bits per character.

    Args:
        tuples: The sequence of LZ77 tuples; only its length is used.
        sliding_window_size: Window size that determines the field width.
        genome: The original text; only its length is used.

    Returns:
        A ``(compressed_size_in_bits, compression_rate)`` pair, where the
        rate is the original size in bits divided by the compressed size.
    """
    bits_per_char = 8
    field_bits = math.ceil(math.log2(sliding_window_size))

    # One character field plus the distance and length fields.
    bits_per_tuple = bits_per_char + 2 * field_bits

    compressed_size_in_bits = len(tuples) * bits_per_tuple
    original_size_in_bits = len(genome) * 8
    return (
        compressed_size_in_bits,
        original_size_in_bits / compressed_size_in_bits,
    )


# The following code is to avoid recompressing the genome
# each time we run the program.

def lz77_cached_compression(sliding_window_size, genome):
    """Return the LZ77 tuples for ``genome``, reusing an on-disk cache.

    The cache is a pickle file named ``LZ77Cache{sliding_window_size}.dat``
    (a relative path).  When the file exists its content is returned as-is;
    otherwise the genome is compressed, the result is verified by decoding
    it back, and the tuples are written to the cache file.

    Args:
        sliding_window_size: Window size passed to ``LZ77_encoder``; it is
            also the only value that determines the cache file name.
        genome: The text to compress.

    Returns:
        The tuples produced by ``LZ77_encoder`` (or loaded from the cache).

    Raises:
        AssertionError: If decoding the fresh encoding does not give back
            ``genome``.  Nothing is written to the cache in that case.
    """
    cache_name = f"LZ77Cache{sliding_window_size}.dat"

    if os.path.exists(cache_name):
        # NOTE(review): the file name depends only on the window size, so a
        # cache produced from a different genome is returned unchanged.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use cache files written by this program.
        with open(cache_name, "rb") as fin:
            return pickle.load(fin)

    print(f"Crunching with LZ77, sliding window {sliding_window_size}")
    chrono = datetime.now()
    tuples = LZ77_encoder(genome, sliding_window_size)
    print(f"Compression took {datetime.now() - chrono}")

    # Explicit raise rather than an `assert` statement: asserts are removed
    # under `python -O`, which would let a broken encoding be cached.
    if "".join(LZ77_decoder(tuples)) != genome:
        raise AssertionError("LZ77 compression went wrong")

    with open(cache_name, "wb") as fout:
        pickle.dump(tuples, fout)

    return tuples


def LZ77_decoder(encoded):
decoded = []
for d, l, c in encoded:
Expand Down Expand Up @@ -91,17 +125,3 @@ def LZ77_decoder(encoded):
S = "abracadabrad"
print(S)
print(LZ77_encoder(S, 7))

""" Q10. Encode the genome using the LZ77 algorithm. Give the total
length of the encoded genome and the compression rate."""

# numpy is used here only to read the genome file.
import numpy as np
# Load genome.txt as an array of strings.
# NOTE(review): presumably one entry per line/token of the file - confirm
# against the actual file layout.
genome = np.genfromtxt("genome.txt", dtype='str')
# Concatenate the loaded pieces into a single string.
genome = "".join(genome)

print("Compressing full genome")
# Second argument is the sliding window size: 1024 // 2 == 512.
compressed = LZ77_encoder(genome, 1024 // 2)
print(f"Compression has {len(compressed)} tuples")
print("Decompressing")
decompressed = LZ77_decoder(compressed)
# Round-trip check: decoding must give back the original genome.
assert "".join(decompressed) == genome
78 changes: 78 additions & 0 deletions p2_huffman.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import heapq
from io import StringIO


class Node:
    """A node of a Huffman tree.

    A node is either a leaf, built from a ``symbol`` and a positive
    ``weight`` with no children, or an internal node, built from exactly
    two children.  An internal node's weight is the sum of its children's
    weights and its symbol is ``None``.

    Nodes compare (``==`` and ``<``) by weight only.  ``code`` starts as
    ``None``.
    """

    def __init__(self, left_child=None, right_child=None, weight=None, symbol=None):
        """Create a leaf (``weight`` and ``symbol``) or an internal node.

        Args:
            left_child: Left subtree, or ``None`` for a leaf.
            right_child: Right subtree, or ``None`` for a leaf.
            weight: Positive weight of a leaf; must be ``None`` for an
                internal node.
            symbol: Symbol of a leaf; must be ``None`` for an internal node.

        Raises:
            AssertionError: If exactly one child is given, if an internal
                node is given a weight or symbol, or if a leaf lacks a
                positive weight or a symbol.
        """
        # Checked first so that a node with a single child fails with this
        # message instead of a TypeError from comparing a None weight.
        assert (left_child is None) == (right_child is None), \
            "A node must have either no children or both children"

        self.left_child = left_child
        self.right_child = right_child
        self.code = None

        if self.has_both_children():
            assert weight is None and symbol is None
            self.weight = self.left_child.weight + self.right_child.weight
            self.symbol = None
        else:
            # `weight is not None` guards the `> 0` comparison so that a
            # missing weight is reported by the assertion message.
            assert weight is not None and weight > 0 and symbol is not None, \
                f"Weight={weight}, symbol={symbol}"
            self.weight = weight
            self.symbol = symbol

    def has_both_children(self):
        """Return True when both the left and the right child are set."""
        return self.left_child is not None and self.right_child is not None

    def __eq__(self, other):
        """Nodes are equal when their weights are equal."""
        if not isinstance(other, Node):
            return NotImplemented
        return self.weight == other.weight

    # Defining __eq__ already makes instances unhashable; stated explicitly
    # because equality by weight is not a suitable basis for hashing.
    __hash__ = None

    def __lt__(self, other):
        """Order nodes by weight."""
        if not isinstance(other, Node):
            return NotImplemented
        return self.weight < other.weight


def build_huffman_tree(symbols_cnts: dict):
    """Build a Huffman tree from a symbol -> count mapping.

    Args:
        symbols_cnts: Maps each symbol to its count, used as the leaf weight.

    Returns:
        The root ``Node`` of the tree.
    """
    # One (weight, leaf) heap entry per symbol; heapq keeps the smallest
    # entry at the front.
    heap = [
        (count, Node(None, None, count, sym))
        for sym, count in symbols_cnts.items()
    ]
    heapq.heapify(heap)

    # Repeatedly merge the two lightest subtrees until one tree is left.
    while len(heap) > 1:
        _, lightest = heapq.heappop(heap)
        _, next_lightest = heapq.heappop(heap)

        merged = Node(lightest, next_lightest)
        heapq.heappush(heap, (merged.weight, merged))

    # The single remaining entry holds the root of the tree.
    _, root = heap[0]
    return root


def compute_leaves_codes(node: Node, prefix=""):
    """Assign a binary code to every leaf below ``node``.

    Going to a left child appends "0" to the code and going to a right
    child appends "1".  Each leaf's ``code`` attribute is set to the
    resulting string.

    Args:
        node: Root of the (sub)tree to label.
        prefix: Code accumulated on the way down to ``node``.

    Returns:
        The list of leaf nodes, in left-to-right order.
    """
    leaves = []
    pending = [(node, prefix)]

    while pending:
        current, code = pending.pop()

        if current.has_both_children():
            # Push the right child first so the left one is popped, and
            # therefore labelled, first.
            pending.append((current.right_child, code + "1"))
            pending.append((current.left_child, code + "0"))
            continue

        assert current.left_child is None and current.right_child is None
        current.code = code
        leaves.append(current)

    return leaves


def build_codebooks(top_node):
    """Build the encoding and decoding tables of a Huffman tree.

    Args:
        top_node: Root node of the Huffman tree.

    Returns:
        A ``(code_map, decode_map)`` pair: ``code_map`` maps each symbol to
        its code and ``decode_map`` maps each code back to its symbol.
    """
    # Assign a code to each leaf, then walk the leaves by increasing weight.
    leaves = compute_leaves_codes(top_node, "")
    by_weight = sorted(leaves, key=lambda leaf: leaf.weight)

    code_map = {leaf.symbol: leaf.code for leaf in by_weight}
    decode_map = {leaf.code: leaf.symbol for leaf in by_weight}
    return code_map, decode_map
Loading

0 comments on commit d2276fb

Please sign in to comment.