Language:
Lua     Change language:
Pastebin: 87899
Author: Galmok
Subject: Huffman compression algorithm
Created: 2008-05-15 21:08:18
Download and save
Toggle line numbers
1---------------------------------------------------------------------------------- 
2-- 
3-- huffman.lua 
4-- 
5-- Author: Galmok of European Stormrage (Horde) 
6-- Email : galmok@gmail.com 
7-- Licence: GPL version 2 (General Public License) 
8---------------------------------------------------------------------------------- 
9 
10local function addCode(tree, bcode,len) 
11    if tree then 
12        tree.bcode = bcode; 
13        tree.blength = len; 
14        if tree.c1 then 
15            addCode(tree.c1, bit.bor(bcode, bit.lshift(1,len)), len+1
16        end 
17        if tree.c2 then 
18            addCode(tree.c2, bcode, len+1
19        end 
20    end 
21end 
22 
23local function escape_code(code, len) 
24    local escaped_code = 0
25    local b; 
26    local l = 0
27    for i = len-1, 0,- 1 do 
28        b = bit.band( code, bit.lshift(1,i))==0 and 0 or 1 
29        escaped_code = bit.lshift(escaped_code,1+b)+b 
30        l = l+b; 
31    end 
32    return escaped_code, len+l 
33end 
34 
35local remainder = 0
36local remainder_length = 0
37local function addBits(tbl, code, len) 
38    remainder = remainder + bit.lshift(code, remainder_length) 
39    remainder_length = len + remainder_length 
40 
41    while remainder_length>=8 do 
42        table.insert(tbl, string.char(bit.band(remainder, 255))) 
43        remainder = bit.rshift(remainder, 8
44        remainder_length = remainder_length -8 
45    end 
46end 
47 
48-- word size for this huffman algorithm is 8 bits (1 byte). This means the best compression is representing 1 byte with 1 bit, i.e. compress to 0.125 of original size. 
49function Hencode(uncompressed) 
50    if not type(uncompressed)=="string" then 
51        return nil, "Can only compress strings" 
52    end 
53 
54    -- make histogram 
55    local hist = {} 
56    local n = 0 
57    -- dont have to use all datat to make the histogram 
58    local uncompressed_size = string.len(uncompressed) 
59    local c; 
60    for i = 1, uncompressed_size do 
61        c=string.byte(uncompressed,i ) 
62        hist[c]=(hist[c] or 0)+1 
63    end 
64 
65    --Start with as many leaves as there are symbols. 
66    local leafs = {} 
67    local leaf; 
68    local symbols = {} 
69    for symbol, weight in pairs(hist) do 
70        leaf = { symbol=string.char(symbol), weight=weight }
71        symbols[symbol] = leaf; 
72        table.insert(leafs, leaf) 
73    end 
74    --Enqueue all leaf nodes into the first queue (by probability in increasing order so that the least likely item is in the head of the queue). 
75    sort(leafs, function(a,b) if a.weight<b.weight then return true elseif a.weight>b.weight then return false else return nil end end
76 
77    local nLeafs = #leafs 
78 
79    -- create tree 
80    local huff = {} 
81    --While there is more than one node in the queues: 
82    local l,h, li, hi, leaf1, leaf2 
83    local newNode; 
84    while (#leafs+#huff > 1) do 
85        -- Dequeue the two nodes with the lowest weight. 
86        -- Dequeue first 
87        if not next(huff) then 
88            li, leaf1 = next(leafs) 
89            table.remove(leafs, li) 
90        elseif not next(leafs) then 
91            hi, leaf1 = next(huff) 
92            table.remove(huff, hi) 
93        else 
94            li, l = next(leafs); 
95            hi, h = next(huff); 
96            if l.weight<=h.weight then 
97                leaf1 = l; 
98                table.remove(leafs, li) 
99            else 
100                leaf1 = h; 
101                table.remove(huff, hi) 
102            end 
103        end 
104        -- Dequeue second 
105        if not next(huff) then 
106            li, leaf2 = next(leafs) 
107            table.remove(leafs, li) 
108        elseif not next(leafs) then 
109            hi, leaf2 = next(huff) 
110            table.remove(huff, hi) 
111        else 
112            li, l = next(leafs); 
113            hi, h = next(huff); 
114            if l.weight<=h.weight then 
115                leaf2 = l; 
116                table.remove(leafs, li) 
117            else 
118                leaf2 = h; 
119                table.remove(huff, hi) 
120            end 
121        end 
122 
123        --Create a new internal node, with the two just-removed nodes as children (either node can be either child) and the sum of their weights as the new weight. 
124        newNode = { c1 = leaf1, c2 = leaf2, weight = leaf1.weight+leaf2.weight } 
125        table.insert(huff,newNode) 
126    end 
127    if #leafs>0 then 
128        li, l = next(leafs) 
129        table.insert(huff, l) 
130        table.remove(leafs, li) 
131    end 
132    huff = huff[1]; 
133 
134    -- assign codes to each symbol 
135    -- c1 = "0", c2 = "1" 
136    -- As a common convention, bit '0' represents following the left child and bit '1' represents following the right child. 
137    -- c1 = left, c2 = right 
138 
139    addCode(huff,0,0); 
140    if huff then 
141        huff.bcode = 0 
142        huff.blength = 1 
143    end 
144 
145    -- READING 
146    -- bitfield = 0 
147    -- bitfield_len = 0 
148    -- read byte1 
149    -- bitfield = bitfield + bit.lshift(byte1, bitfield_len) 
150    -- bitfield_len = bitfield_len + 8 
151    -- read byte2 
152    -- bitfield = bitfield + bit.lshift(byte2, bitfield_len) 
153    -- bitfield_len = bitfield_len + 8 
154    -- (use 5 bits) 
155    -- word = bit.band( bitfield, bit.lshift(1,5)-1) 
156    -- bitfield = bit.rshift( bitfield, 5) 
157    -- bitfield_len = bitfield_len - 5 
158    -- read byte3 
159    -- bitfield = bitfield + bit.lshift(byte3, bitfield_len) 
160    -- bitfield_len = bitfield_len + 8 
161 
162    -- WRITING 
163    remainder = 0
164    remainder_length = 0
165 
166    local compressed = {} 
167 
168    -- Header: byte 0=#leafs, byte 1-3=size of uncompressed data 
169    -- max 2^24 bytes 
170    local l = string.len(uncompressed) 
171    local c0 = bit.band(l, 255);           -- bit 0-7 
172    local c1 = bit.band(bit.rshift(l, 8), 255);    -- bit 8-15 
173    local c2 = bit.band(bit.rshift(l, 16), 255);   -- bit 16-23 
174    table.insert(compressed, string.char(1)) -- first byte is version info. 0 = uncompressed, 1 = 8-bit word huffman compressed 
175    table.insert(compressed, string.char(bit.band(nLeafs-1, 255))) -- number of leafs 
176    table.insert(compressed, string.char(c0)) 
177    table.insert(compressed, string.char(c1)) 
178    table.insert(compressed, string.char(c2)) 
179 
180    -- create symbol/code map 
181    for symbol, leaf in pairs(symbols) do 
182        addBits(compressed, symbol, 8); 
183        addBits(compressed, escape_code(leaf.bcode, leaf.blength)) 
184        addBits(compressed, 3, 2); 
185    end 
186 
187    -- create huffman code 
188    for i=1, l do 
189        c = string.byte(uncompressed,i) 
190        addBits(compressed, symbols[c].bcode, symbols[c].blength) 
191    end 
192 
193    -- add remainding bits (if any) 
194    if remainder_length>0 then 
195        table.insert(compressed, string.char(remainder)) 
196    end 
197 
198    -- is compression worth it? If not, return uncompressed data. 
199    if (#uncompressed+1) <= #compressed then 
200        return string.char(0)..uncompressed 
201    end 
202 
203    return table.concat(compressed); 
204end 
205 
206local function getSymbolAndCode(bitfield, field_len) 
207    local b; 
208    if field_len>=8+2 then 
209        local p=0
210        -- ignore first 8 bits 
211        for i = 8, field_len-1 do 
212            b = bit.band(bitfield, bit.lshift(1,i)) 
213            if not (p==0) and not (b == 0) then 
214                -- found 2 bits set right after each other (stop bits) 
215                return bit.band(bitfield, 255), bit.band( bit.rshift(bitfield, 8), bit.lshift(1, i-8-1)-1 ), i-8-1
216                    bit.rshift(bitfield, i+1), field_len-i-1 
217            end 
218            p = b 
219        end 
220    end 
221    return nil 
222end 
223 
224local function unescape_code(code, code_len) 
225    local unescaped_code=0
226    local b; 
227    local l = 0
228    local i = 0 
229    while i < code_len do 
230        b = bit.band( code, bit.lshift(1,i))==0 and 0 or 1 
231        if b==1 then 
232            unescaped_code = bit.bor(unescaped_code, bit.lshift(1, l)) 
233            i = i+1 
234        end 
235        i = i+1 
236        l = l+1 
237    end 
238    return unescaped_code, l 
239end 
240 
241function Hdecode(compressed) 
242    if not type(uncompressed)=="string" then 
243        return nil, "Can only uncompress strings" 
244    end 
245 
246    local compressed_size = #compressed 
247    --decode header 
248    local info_byte = string.byte(compressed) 
249    -- is data compressed 
250    if info_byte==0 then 
251        return compressed:sub(1) --return uncompressed data 
252    end 
253    if not (info_byte==1) then 
254        return nil, "Cannot uncompress: Unknown version ("..tostring(info_byte)..")" 
255    end 
256 
257    local num_symbols=string.byte(string.sub(compressed,2,2))+1 
258    local c0=string.byte(string.sub(compressed,3,3)) 
259    local c1=string.byte(string.sub(compressed,4,4)) 
260    local c2=string.byte(string.sub(compressed,5,5)) 
261    local orig_size=c2*65536+c1*256+c0 
262    if orig_size==0 then 
263        return ""
264    end 
265 
266    -- decode code->symbal map 
267    local bitfield=0
268    local bitfield_len=0
269    local map={} 
270    local i=6; -- byte 1-5 are header bytes 
271    local c, cl; 
272    local minCodeLen=1000
273    local maxCodeLen=0
274    local n=0
275    while n<num_symbols do 
276        if i>compressed_size then 
277            return nil, "Cannot decode map" 
278        end 
279 
280        c=string.byte(string.sub(compressed,i,i)) 
281        bitfield = bit.bor(bitfield, bit.lshift(c, bitfield_len)) 
282        bitfield_len = bitfield_len + 8 
283 
284        _symbol, _code, _code_len, _bitfield, _bitfield_len = getSymbolAndCode(bitfield, bitfield_len) 
285        if _symbol then 
286            bitfield = _bitfield 
287            bitfield_len = _bitfield_len 
288            c, cl = unescape_code(_code, _code_len) 
289            if not map[cl] then 
290                map[cl]={} 
291            end 
292            map[cl][c]=string.char(_symbol) 
293            if cl<minCodeLen then 
294                minCodeLen=cl 
295            end 
296            if cl>maxCodeLen then 
297                maxCodeLen=cl 
298            end 
299            --print("symbol: "..string.char(_symbol).."  code: "..tobinary(c, cl)) 
300            n=n+1 
301        end 
302        i=i+1 
303    end 
304 
305    -- lookuptable 
306    local filterMask={} 
307    for l=minCodeLen,maxCodeLen do 
308        filterMask[l] = bit.lshift(1, l)-1 
309    end 
310 
311    local uncompressed={} 
312    local test_code 
313    local test_code_len=minCodeLen; 
314    local symbol; 
315    local dec_size=0
316    compressed_size = compressed_size + 1 
317    while true do 
318        if test_code_len<=bitfield_len then 
319            test_code=bit.band( bitfield, filterMask[test_code_len]) 
320            --print(tobinary(test_code, test_code_len)) 
321            symbol=map[test_code_len][test_code] 
322            if symbol then 
323                table.insert(uncompressed, symbol) 
324                dec_size = dec_size + 1 
325                if dec_size>=orig_size then -- checked here for speed reasons 
326                    break
327                end 
328                --print(map[l][test_code]) 
329                bitfield = bit.rshift(bitfield, test_code_len) 
330                bitfield_len = bitfield_len - test_code_len 
331                test_code_len=minCodeLen 
332            else 
333                test_code_len=test_code_len+1 
334                if test_code_len>maxCodeLen then 
335                    return nil, "Decompression error at "..tostring(i).."/"..tostring(#compressed) 
336                end 
337            end 
338        else 
339            c=string.byte(string.sub(compressed,i,i)) 
340            bitfield = bitfield +  bit.lshift(c or 0, bitfield_len) 
341            bitfield_len = bitfield_len + 8 
342            i=i+1 
343            if i>compressed_size then -- checked here for speed reasons 
344                break
345            end 
346        end 
347    end 
348    return table.concat(uncompressed) 
349end 
350 
351---------- TEST CODE BELOW ---------- 
352function tobinary(val,len) 
353    if not val then 
354        return 
355    end 
356    local bin={} 
357    for i=len-1,0,-1 do 
358        table.insert(bin,bit.band(val,bit.lshift(1,i))==0 and "0" or "1"
359    end 
360    return table.concat(bin); 
361end 
362 
363 
364-- 256 different chars 
365teststr={} 
366for i=0,255 do 
367    table.insert(teststr,string.char(i)) 
368end 
369teststr=table.concat(teststr) 
370 
371if true then 
372teststr={} 
373for i=1,20000 do 
374    table.insert(teststr,string.char(math.random(25)-1)) 
375end 
376teststr=table.concat(teststr) 
377end 
378 
379--teststr="In computer science and information theory, Huffman coding is an entropy encoding algorithm used for lossless data compression. The term refers to the use of a variable-length code table for encoding a source symbol (such as a character in a file) where the variable-length code table has been derived in a particular way based on the estimated probability of occurrence for each possible value of the source symbol. It was developed by David A. Huffman while he was a Ph.D. student at MIT, and published in the 1952 paper \"A Method for the Construction of Minimum-Redundancy Codes.\" Huffman became a member of the MIT faculty upon graduation and was later the founding member of the Computer Science Department at the University of California, Santa Cruz."; 
380--teststr="this is an example of a huffman tree"; 
381--teststr="TOBEORNOTTOBEORTOBEORNOT" 
382--teststr="TOBEORNOTTOBEORTOBEORNOTTOBEORNOTTOBEORTOBEORNOTTOBEORNOTTOBEORTOBEORNOTTOBEORNOTTOBEORTOBEORNOT" 
383--teststr=string.rep("abc",20) 
384--teststr="a" 
385--teststr="" 
386 
387t=os.clock() 
388huff=Hencode(teststr) 
389print("encoding time: "..tonumber(os.clock()-t)) 
390 
391t=os.clock() 
392uncompressed=Hdecode(huff) 
393print("decoding time: "..tonumber(os.clock()-t)) 
394 
395print(uncompressed:sub(1,70).."..."
396 
397if teststr == uncompressed then 
398    print("Strings are identical. Compress/decompress works"
399end 
400 
401print("Original size: "..tostring(#teststr)) 
402print("Compressed size: "..tostring(#huff)) 
403print("Compression ratio: "..tostring(#huff/#teststr)) 
404 
Download and save
Toggle line numbers
Thread:
[87899] Huffman compression algorithm by Galmok at 2008-05-15 21:08:18
  [87960] Re: Huffman compression algorithm by Galmok at 2008-05-17 05:54:58 (diff)
  [102035] Re: Huffman compression algorithm by Anonymous at 2008-11-17 01:38:03 (diff)
Tip: Click the line numbers to toggle highliting on that line.

Paste followup:

Language:
Author:
Subject:


    Tabstop:     bigger biggest
Note: You can prefix a line with "@@@" to highlight it.