-
-
Notifications
You must be signed in to change notification settings - Fork 46.1k
/
lz77.py
225 lines (179 loc) · 7.86 KB
/
lz77.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""
LZ77 compression algorithm
- lossless data compression published in papers by Abraham Lempel and Jacob Ziv in 1977
- also known as LZ1 or sliding-window compression
- forms the basis for many variations including LZW, LZSS, LZMA and others
It uses a “sliding window” method. Within the sliding window we have:
- search buffer
- look ahead buffer
len(sliding_window) = len(search_buffer) + len(look_ahead_buffer)
LZ77 manages a dictionary that uses triples composed of:
- Offset into search buffer, it's the distance, counted backwards from the current
position, to the start of the matched phrase (0 when there is no match).
- Length of the match, it's the number of characters that make up a phrase.
- The indicator is represented by a character that is going to be encoded next.
As a file is parsed, the dictionary is dynamically updated to reflect the compressed
data contents and size.
Examples:
"cabracadabrarrarrad" <-> [(0, 0, 'c'), (0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'),
(3, 1, 'c'), (2, 1, 'd'), (7, 4, 'r'), (3, 5, 'd')]
"ababcbababaa" <-> [(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), (4, 3, 'a'), (2, 2, 'a')]
"aacaacabcabaaac" <-> [(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), (3, 3, 'a'), (1, 2, 'c')]
Sources:
en.wikipedia.org/wiki/LZ77_and_LZ78
"""
from dataclasses import dataclass
# Module metadata: version string and author credit.
__version__ = "0.1"
__author__ = "Lucia Harcekova"
@dataclass
class Token:
    """
    Dataclass representing the triplet, called a token, that LZ77 emits. It
    consists of offset, length and indicator (in that order).
    """

    # Distance, counted backwards from the end of the text produced so far, to
    # the first character of the match (0 when nothing matched).
    offset: int
    # Number of characters to copy from the text produced so far.
    length: int
    # Literal character that follows the copied characters.
    indicator: str

    def __repr__(self) -> str:
        """
        Render the token as ``(offset, length, indicator)`` without quotes.

        >>> token = Token(1, 2, "c")
        >>> repr(token)
        '(1, 2, c)'
        >>> str(token)
        '(1, 2, c)'
        """
        return f"({self.offset}, {self.length}, {self.indicator})"


class LZ77Compressor:
    """
    Class containing compress and decompress methods using LZ77 compression algorithm.
    """

    def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> None:
        """
        Store the window configuration.

        Args:
            window_size: total size of the sliding window
            lookahead_buffer_size: size of the look ahead part of the window

        The search buffer size is derived as the difference of the two values.
        Neither value is validated here; window_size is expected to be larger
        than lookahead_buffer_size so the search buffer has a positive size.
        Note that lookahead_buffer_size is only used to derive the search buffer
        size: match lengths are not capped by it.
        """
        self.window_size = window_size
        self.lookahead_buffer_size = lookahead_buffer_size
        self.search_buffer_size = self.window_size - self.lookahead_buffer_size

    def compress(self, text: str) -> list[Token]:
        """
        Compress the given string text using LZ77 compression algorithm.

        Args:
            text: string to be compressed

        Returns:
            output: the compressed text as a list of Tokens

        >>> lz77_compressor = LZ77Compressor()
        >>> str(lz77_compressor.compress("ababcbababaa"))
        '[(0, 0, a), (0, 0, b), (2, 2, c), (4, 3, a), (2, 2, a)]'
        >>> str(lz77_compressor.compress("aacaacabcabaaac"))
        '[(0, 0, a), (1, 1, c), (3, 4, b), (3, 3, a), (1, 2, c)]'

        Text whose tail repeats earlier content is handled as well:

        >>> str(lz77_compressor.compress("abab"))
        '[(0, 0, a), (0, 0, b), (2, 1, b)]'
        >>> lz77_compressor.compress("")
        []
        """
        output = []
        search_buffer = ""

        # while there are still characters in text to compress
        while text:
            # find the next encoding phrase
            # - triplet with offset, length, indicator (the next encoding character)
            token = self._find_encoding_token(text, search_buffer)

            # every token consumes its matched characters plus the indicator
            consumed = token.length + 1

            # update the search buffer:
            # - add new characters from text into it
            # - check if size exceed the max search buffer size, if so, drop the
            #   oldest elements
            search_buffer += text[:consumed]
            if len(search_buffer) > self.search_buffer_size:
                search_buffer = search_buffer[-self.search_buffer_size :]

            # update the text
            text = text[consumed:]

            # append the token to output
            output.append(token)

        return output

    def decompress(self, tokens: list[Token]) -> str:
        """
        Convert the list of tokens into an output string.

        Args:
            tokens: list containing triplets (offset, length, char)

        Returns:
            output: decompressed text

        Tests:
            >>> lz77_compressor = LZ77Compressor()
            >>> lz77_compressor.decompress([Token(0, 0, 'c'), Token(0, 0, 'a'),
            ... Token(0, 0, 'b'), Token(0, 0, 'r'), Token(3, 1, 'c'),
            ... Token(2, 1, 'd'), Token(7, 4, 'r'), Token(3, 5, 'd')])
            'cabracadabrarrarrad'
            >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(0, 0, 'b'),
            ... Token(2, 2, 'c'), Token(4, 3, 'a'), Token(2, 2, 'a')])
            'ababcbababaa'
            >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(1, 1, 'c'),
            ... Token(3, 4, 'b'), Token(3, 3, 'a'), Token(1, 2, 'c')])
            'aacaacabcabaaac'
        """
        # Collect characters in a list and join once at the end instead of
        # growing a string character by character.
        output: list[str] = []
        for token in tokens:
            # Copy one character at a time: the source position may lie inside
            # the characters appended by this very token (overlapping match).
            for _ in range(token.length):
                output.append(output[-token.offset])
            # extend (not append) keeps one list element per character
            output.extend(token.indicator)
        return "".join(output)

    def _find_encoding_token(self, text: str, search_buffer: str) -> Token:
        """Finds the encoding token for the first character in the text.

        The longest match wins; among matches of equal length the one closest to
        the end of the search buffer (smallest offset) wins. The match never
        consumes the last character of text, because every token needs an
        indicator character.

        Args:
            text: remaining text to encode, must not be empty
            search_buffer: most recently encoded characters

        Returns:
            Token describing the match and the character that follows it.

        Raises:
            ValueError: if text is empty.

        Tests:
            >>> lz77_compressor = LZ77Compressor()
            >>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad").offset
            7
            >>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac").length
            1
            >>> lz77_compressor._find_encoding_token("abc", "xyz").offset
            0
            >>> lz77_compressor._find_encoding_token("", "xyz").offset
            Traceback (most recent call last):
                ...
            ValueError: We need some text to work with.
            >>> lz77_compressor._find_encoding_token("abc", "").offset
            0
            >>> lz77_compressor._find_encoding_token("ab", "ab")
            (2, 1, b)
        """
        if not text:
            raise ValueError("We need some text to work with.")

        # Initialise result parameters to default values
        length, offset = 0, 0

        # Keep the final character of text available as the indicator.
        longest_allowed = len(text) - 1

        for i, character in enumerate(search_buffer):
            if character != text[0]:
                continue

            found_length = min(
                self._match_length_from_index(text, search_buffer, 0, i),
                longest_allowed,
            )
            # A match capped down to zero is no match at all. Otherwise take it
            # if it is longer than the current one, or equally long, which
            # means its offset is smaller.
            if found_length and found_length >= length:
                offset, length = len(search_buffer) - i, found_length

        return Token(offset, length, text[length])

    def _match_length_from_index(
        self, text: str, window: str, text_index: int, window_index: int
    ) -> int:
        """Calculate the longest possible match of text and window characters from
        text_index in text and window_index in window.

        Matched characters are appended to a working copy of the window, so a
        match may run past the end of the original window into the text itself.

        Args:
            text: text whose characters are compared against the window
            window: sliding window
            text_index: index of character in text
            window_index: index of character in sliding window

        Returns:
            The maximum match between text and window, from given indexes.

        Tests:
            >>> lz77_compressor = LZ77Compressor(13, 6)
            >>> lz77_compressor._match_length_from_index("rarrad", "adabrar", 0, 4)
            5
            >>> lz77_compressor._match_length_from_index("adabrarrarrad",
            ...     "cabrac", 0, 1)
            1
            >>> lz77_compressor._match_length_from_index("ab", "ab", 0, 0)
            2
        """
        # Iterative on purpose: one stack frame per matched character would
        # exhaust the recursion limit on long runs.
        extended_window = list(window)
        matched = 0
        # Stop at the end of text instead of indexing past it.
        while (
            text_index < len(text)
            and text[text_index] == extended_window[window_index]
        ):
            extended_window.append(text[text_index])
            text_index += 1
            window_index += 1
            matched += 1
        return matched
if __name__ == "__main__":
    import doctest

    # Run the doctests embedded in this module before the demo.
    doctest.testmod()

    # Compressor with a window of 13, of which 6 is the look ahead buffer.
    codec = LZ77Compressor(window_size=13, lookahead_buffer_size=6)

    # Demo: print the tokens for one sample string, then check that a second
    # sample survives a compress/decompress round trip.
    sample = "cabracadabrarrarrad"
    encoded = codec.compress(sample)
    print(codec.compress("ababcbababaa"))
    restored = codec.decompress(encoded)
    assert restored == sample, "The LZ77 algorithm returned the invalid result."