 * bzip2 is written by Julian Seward <jseward@bzip.org>.
 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
 * See README and LICENSE files in this directory for more information.
/*-------------------------------------------------------------*/
/*--- Block sorting machinery                               ---*/
/*---                                           blocksort.c ---*/
/*-------------------------------------------------------------*/
/* ------------------------------------------------------------------
This file is part of bzip2/libbzip2, a program and library for
lossless, block-sorting data compression.
bzip2/libbzip2 version 1.0.4 of 20 December 2006
Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
Please read the WARNING, DISCLAIMER and PATENTS sections in the
This program is released under the terms of the license contained
------------------------------------------------------------------ */
/* #include "bzlib_private.h" */
#define mswap(zz1, zz2) \
	int32_t zztmp = zz1; \
/* No measurable speed gain with inlining */
void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
		mswap(ptr[zzp1], ptr[zzp2]);
int32_t mmin(int32_t a, int32_t b)
	return (a < b) ? a : b;
/*---------------------------------------------*/
/*--- Fallback O(N log(N)^2) sorting        ---*/
/*--- algorithm, for repetitive blocks      ---*/
/*---------------------------------------------*/
/*---------------------------------------------*/
void fallbackSimpleSort(uint32_t* fmap,
		for (i = hi-4; i >= lo; i--) {
			for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
	for (i = hi-1; i >= lo; i--) {
		for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
/*---------------------------------------------*/
#define fpush(lz,hz) { \
#define fpop(lz,hz) { \
#define FALLBACK_QSORT_SMALL_THRESH 10
#define FALLBACK_QSORT_STACK_SIZE   100
void fallbackQSort3(uint32_t* fmap,
	int32_t unLo, unHi, ltLo, gtHi, n, m;
	int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
	int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
		AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
		if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
			fallbackSimpleSort(fmap, eclass, lo, hi);
		/* Random partitioning.  Median of 3 sometimes fails to
		 * avoid bad cases.  Median of 9 seems to help but
		 * looks rather expensive.  This too seems to work but
		 * is cheaper.  Guidance for the magic constants
		 * 7621 and 32768 is taken from Sedgewick's algorithms
		r = ((r * 7621) + 1) % 32768;
			med = eclass[fmap[lo]];
			med = eclass[fmap[(lo+hi)>>1]];
			med = eclass[fmap[hi]];
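		/* The pseudo-random value r decides which of the three
		 * assignments above runs, so the pivot is drawn from the first,
		 * middle or last position of the range; a fixed choice would
		 * let one unlucky input hit the bad case on every partition. */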
				if (unLo > unHi) break;
				n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
					mswap(fmap[unLo], fmap[ltLo]);
				if (unLo > unHi) break;
				n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
					mswap(fmap[unHi], fmap[gtHi]);
			if (unLo > unHi) break;
			mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
		AssertD(unHi == unLo-1, "fallbackQSort3(2)");
		if (gtHi < ltLo) continue;
		n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
		m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
		n = lo + unLo - ltLo - 1;
		m = hi - (gtHi - unHi) + 1;
		if (n - lo > hi - m) {
#undef FALLBACK_QSORT_SMALL_THRESH
#undef FALLBACK_QSORT_STACK_SIZE
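
/* Illustrative sketch, not part of the original bzip2/busybox sources: the
 * same split-end three-way partitioning scheme that fallbackQSort3 (above)
 * and mainQSort3 (below) use, shown on a plain int array.  Elements equal to
 * the pivot are first parked at the two ends of the range and then
 * block-swapped back into the middle, so only the "<" and ">" regions need
 * any further sorting.  Kept under #if 0 so it does not affect the build;
 * the function name is made up for illustration.
 */
#if 0
static void three_way_partition_sketch(int* a, int lo, int hi, int pivot)
{
	int ltLo = lo, unLo = lo;  /* a[lo..ltLo-1] == pivot (parked left)  */
	int gtHi = hi, unHi = hi;  /* a[gtHi+1..hi] == pivot (parked right) */
	int n, m, i, tmp;

	while (1) {
		/* advance from the left while <= pivot, parking equals */
		while (unLo <= unHi && a[unLo] <= pivot) {
			if (a[unLo] == pivot) {
				tmp = a[unLo]; a[unLo] = a[ltLo]; a[ltLo] = tmp;
				ltLo++;
			}
			unLo++;
		}
		/* advance from the right while >= pivot, parking equals */
		while (unLo <= unHi && a[unHi] >= pivot) {
			if (a[unHi] == pivot) {
				tmp = a[unHi]; a[unHi] = a[gtHi]; a[gtHi] = tmp;
				gtHi--;
			}
			unHi--;
		}
		if (unLo > unHi)
			break;
		/* a[unLo] > pivot and a[unHi] < pivot: exchange them */
		tmp = a[unLo]; a[unLo] = a[unHi]; a[unHi] = tmp;
		unLo++;
		unHi--;
	}

	/* Swap the parked "== pivot" runs into the middle, exactly what the
	 * mvswap() calls do in fallbackQSort3/mainQSort3. */
	n = (ltLo - lo < unLo - ltLo) ? ltLo - lo : unLo - ltLo;
	for (i = 0; i < n; i++) {
		tmp = a[lo+i]; a[lo+i] = a[unLo-n+i]; a[unLo-n+i] = tmp;
	}
	m = (hi - gtHi < gtHi - unHi) ? hi - gtHi : gtHi - unHi;
	for (i = 0; i < m; i++) {
		tmp = a[unLo+i]; a[unLo+i] = a[hi-m+1+i]; a[hi-m+1+i] = tmp;
	}
	/* Result: a[lo .. lo+(unLo-ltLo)-1] < pivot,
	 *         a[hi-(gtHi-unHi)+1 .. hi] > pivot,
	 *         everything in between    == pivot. */
}
#endif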
/*---------------------------------------------*/
 * eclass exists for [0 .. nblock-1]
 * ((uint8_t*)eclass) [0 .. nblock-1] holds block
 * ptr exists for [0 .. nblock-1]
 * ((uint8_t*)eclass) [0 .. nblock-1] holds block
 * All other areas of eclass destroyed
 * fmap [0 .. nblock-1] holds sorted order
 * bhtab[0 .. 2+(nblock/32)] destroyed
#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
#define      WORD_BH(zz)  bhtab[(zz) >> 5]
#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
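
/* Worked example: position zz = 37 lives in word bhtab[37 >> 5] = bhtab[1]
 * at bit (37 & 31) = 5, so SET_BH(37) does bhtab[1] |= (1 << 5) and
 * ISSET_BH(37) tests that same bit.  UNALIGNED_BH(zz) is non-zero unless zz
 * is a multiple of 32, which is what lets the scanning loops in
 * fallbackSort() below switch from bit-at-a-time tests to whole-word tests
 * via WORD_BH(). */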
void fallbackSort(uint32_t* fmap,
	int32_t ftabCopy[256];
	int32_t H, i, j, k, l, r, cc, cc1;
	uint8_t* eclass8 = (uint8_t*)eclass;
	 * Initial 1-char radix sort to generate
	 * initial fmap and initial BH bits.
	for (i = 0; i < 257; i++) ftab[i] = 0;
	for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
	for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
	for (i = 1; i < 257; i++) ftab[i] += ftab[i-1];
	for (i = 0; i < nblock; i++) {
	nBhtab = 2 + (nblock / 32);
	for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
	for (i = 0; i < 256; i++) SET_BH(ftab[i]);
	 * Inductively refine the buckets.  Kind-of an
	 * "exponential radix sort" (!), inspired by the
	 * Manber-Myers suffix array construction algorithm.
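	/* Each pass of the loop below works as follows: on entry, the buckets
	 * group rotations that agree on their first H bytes, and eclass[k] is
	 * rewritten to the bucket number of the rotation starting H bytes
	 * after k.  Sorting a bucket by that key (fallbackQSort3) then orders
	 * its members by their first 2*H bytes; the finer bucket boundaries
	 * are recorded in bhtab and H doubles -- hence "exponential". */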
	/*-- set sentinel bits for block-end detection --*/
	for (i = 0; i < 32; i++) {
		SET_BH(nblock + 2*i);
		CLEAR_BH(nblock + 2*i + 1);
	/*-- the log(N) loop --*/
		for (i = 0; i < nblock; i++) {
			/*-- find the next non-singleton bucket --*/
			while (ISSET_BH(k) && UNALIGNED_BH(k))
				while (WORD_BH(k) == 0xffffffff) k += 32;
				while (ISSET_BH(k)) k++;
			while (!ISSET_BH(k) && UNALIGNED_BH(k))
				while (WORD_BH(k) == 0x00000000) k += 32;
				while (!ISSET_BH(k)) k++;
			/*-- now [l, r] bracket current bucket --*/
				nNotDone += (r - l + 1);
				fallbackQSort3(fmap, eclass, l, r);
				/*-- scan bucket and generate header bits --*/
				for (i = l; i <= r; i++) {
					cc1 = eclass[fmap[i]];
		if (H > nblock || nNotDone == 0)
	 * Reconstruct the original block in
	 * eclass8 [0 .. nblock-1], since the
	 * previous phase destroyed it.
	for (i = 0; i < nblock; i++) {
		while (ftabCopy[j] == 0)
		eclass8[fmap[i]] = (uint8_t)j;
	AssertH(j < 256, 1005);
/*---------------------------------------------*/
/*--- The main, O(N^2 log(N)) sorting       ---*/
/*--- algorithm.  Faster for "normal"       ---*/
/*--- non-repetitive blocks.                ---*/
/*---------------------------------------------*/
/*---------------------------------------------*/
/* Loop unrolling here is actually very useful
 * (generated code is much simpler),
 * code size increase is only 270 bytes (i386)
 * but speeds up compression 10% overall
#if CONFIG_BZIP2_FEATURE_SPEED >= 1
#define TIMES_8(code) \
	code; code; code; code; \
	code; code; code; code;
#define TIMES_12(code) \
	code; code; code; code; \
	code; code; code; code; \
	code; code; code; code;
#define TIMES_8(code) \
#define TIMES_12(code) \
	AssertD(i1 != i2, "mainGtU");
		c1 = block[i1]; c2 = block[i2];
		if (c1 != c2) return (c1 > c2);
			c1 = block[i1]; c2 = block[i2];
			if (c1 != c2) return (c1 > c2);
			s1 = quadrant[i1]; s2 = quadrant[i2];
			if (s1 != s2) return (s1 > s2);
		if (i1 >= nblock) i1 -= nblock;
		if (i2 >= nblock) i2 -= nblock;
/*---------------------------------------------*/
 * Knuth's increments seem to work better
 * than Incerpi-Sedgewick here.  Possibly
 * because the number of elems to sort is
 * usually small, typically <= 20.
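/* (Each increment below is 3*previous + 1, i.e. Knuth's h[k+1] = 3*h[k] + 1.) */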
const int32_t incs[14] = {
	1, 4, 13, 40, 121, 364, 1093, 3280,
	9841, 29524, 88573, 265720,
void mainSimpleSort(uint32_t* ptr,
	int32_t i, j, h, bigN, hp;
	if (bigN < 2) return;
	while (incs[hp] < bigN) hp++;
	for (; hp >= 0; hp--) {
			while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
				if (j <= (lo + h - 1)) break;
/* 1.5% overall speedup, +290 bytes */
#if CONFIG_BZIP2_FEATURE_SPEED >= 3
			while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
				if (j <= (lo + h - 1)) break;
			while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
				if (j <= (lo + h - 1)) break;
			if (*budget < 0) return;
/*---------------------------------------------*/
 * The following is an implementation of
 * an elegant 3-way quicksort for strings,
 * described in a paper "Fast Algorithms for
 * Sorting and Searching Strings", by Robert
 * Sedgewick and Jon L. Bentley.
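
/* In mainQSort3 below, the three "ways" at depth d are the rotations whose
 * d-th byte is less than, equal to, or greater than the pivot byte; only the
 * "equal" middle region is pushed back with depth d+1, while the "<" and ">"
 * regions are revisited at the same depth d. */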
uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
#define mpush(lz,hz,dz) \
#define mpop(lz,hz,dz) \
#define mnextsize(az) (nextHi[az] - nextLo[az])
#define mnextswap(az,bz) \
	tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
	tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
	tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
#define MAIN_QSORT_SMALL_THRESH 20
#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
#define MAIN_QSORT_STACK_SIZE   100
void mainQSort3(uint32_t* ptr,
	int32_t unLo, unHi, ltLo, gtHi, n, m, med;
	int32_t sp, lo, hi, d;
	int32_t stackLo[MAIN_QSORT_STACK_SIZE];
	int32_t stackHi[MAIN_QSORT_STACK_SIZE];
	int32_t stackD [MAIN_QSORT_STACK_SIZE];
	mpush(loSt, hiSt, dSt);
		AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
		if (hi - lo < MAIN_QSORT_SMALL_THRESH
		 || d > MAIN_QSORT_DEPTH_THRESH
			mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
		med = (int32_t) mmed3(block[ptr[lo] + d],
				block[ptr[(lo+hi) >> 1] + d]);
				n = ((int32_t)block[ptr[unLo]+d]) - med;
					mswap(ptr[unLo], ptr[ltLo]);
				n = ((int32_t)block[ptr[unHi]+d]) - med;
					mswap(ptr[unHi], ptr[gtHi]);
			mswap(ptr[unLo], ptr[unHi]);
		AssertD(unHi == unLo-1, "mainQSort3(2)");
			mpush(lo, hi, d + 1);
		n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
		m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
		n = lo + unLo - ltLo - 1;
		m = hi - (gtHi - unHi) + 1;
		nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
		nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
		nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
		if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
		if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
		if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
		AssertD(mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
		AssertD(mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
		mpush(nextLo[0], nextHi[0], nextD[0]);
		mpush(nextLo[1], nextHi[1], nextD[1]);
		mpush(nextLo[2], nextHi[2], nextD[2]);
#undef MAIN_QSORT_SMALL_THRESH
#undef MAIN_QSORT_DEPTH_THRESH
#undef MAIN_QSORT_STACK_SIZE
/*---------------------------------------------*/
 * nblock > N_OVERSHOOT
 * block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 * ((uint8_t*)block32) [0 .. nblock-1] holds block
 * ptr exists for [0 .. nblock-1]
 * ((uint8_t*)block32) [0 .. nblock-1] holds block
 * All other areas of block32 destroyed
 * ftab[0 .. 65536] destroyed
 * ptr [0 .. nblock-1] holds sorted order
 * if (*budget < 0), sorting was abandoned
#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
#define SETMASK   (1 << 21)
#define CLEARMASK (~(SETMASK))
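
/* BIGFREQ(b) is the number of rotations in big bucket b (those whose first
 * byte is b), read off the cumulative ftab.  SETMASK is a flag bit stored in
 * the ftab entries themselves: once a small bucket has been sorted, its
 * entry is tagged with bit 21, and CLEARMASK strips the tag to recover the
 * plain index.  Borrowing bit 21 is safe because a bzip2 block is at most
 * 900000 bytes, well below 1 << 21. */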
void mainSort(uint32_t* ptr,
	int32_t i, j, k, ss, sb;
	int32_t runningOrder[256];
	int32_t copyStart[256];
	int32_t copyEnd  [256];
	/*-- set up the 2-byte frequency table --*/
	/* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
	memset(ftab, 0, 65537 * sizeof(ftab[0]));
#if CONFIG_BZIP2_FEATURE_SPEED >= 2
	for (; i >= 3; i -= 4) {
		j = (j >> 8) | (((uint16_t)block[i]) << 8);
		j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
		j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
		j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
	for (; i >= 0; i--) {
		j = (j >> 8) | (((uint16_t)block[i]) << 8);
	/*-- (emphasises close relationship of block & quadrant) --*/
	for (i = 0; i < BZ_N_OVERSHOOT; i++) {
		block   [nblock+i] = block[i];
		quadrant[nblock+i] = 0;
	/*-- Complete the initial radix sort --*/
	for (i = 1; i <= 65536; i++)
		ftab[i] += ftab[i-1];
#if CONFIG_BZIP2_FEATURE_SPEED >= 2
	for (; i >= 3; i -= 4) {
		s = (s >> 8) | (block[i] << 8);
		s = (s >> 8) | (block[i-1] << 8);
		s = (s >> 8) | (block[i-2] << 8);
		s = (s >> 8) | (block[i-3] << 8);
	for (; i >= 0; i--) {
		s = (s >> 8) | (block[i] << 8);
	 * Now ftab contains the first loc of every small bucket.
	 * Calculate the running order, from smallest to largest
	for (i = 0; i <= 255; i++) {
	/* was: int32_t h = 1; */
	/* do h = 3 * h + 1; while (h <= 256); */
		for (i = h; i <= 255; i++) {
			vv = runningOrder[i];
			while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
				runningOrder[j] = runningOrder[j-h];
				if (j <= (h - 1)) goto zero;
			runningOrder[j] = vv;
	 * The main sorting loop.
	for (i = 0; i <= 255; i++) {
		 * Process big buckets, starting with the least full.
		 * Basically this is a 3-step process in which we call
		 * mainQSort3 to sort the small buckets [ss, j], but
		 * also make a big effort to avoid the calls if we can.
		ss = runningOrder[i];
		 * Complete the big bucket [ss] by quicksorting
		 * any unsorted small buckets [ss, j], for j != ss.
		 * Hopefully previous pointer-scanning phases have already
		 * completed many of the small buckets [ss, j], so
		 * we don't have to sort them at all.
		for (j = 0; j <= 255; j++) {
				if (!(ftab[sb] & SETMASK)) {
					int32_t lo = ftab[sb] & CLEARMASK;
					int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
							ptr, block, quadrant, nblock,
							lo, hi, BZ_N_RADIX, budget
						if (*budget < 0) return;
						numQSorted += (hi - lo + 1);
		AssertH(!bigDone[ss], 1006);
		 * Now scan this big bucket [ss] so as to synthesise the
		 * sorted order for small buckets [t, ss] for all t,
		 * including, magically, the bucket [ss,ss] too.
		 * This will avoid doing Real Work in subsequent Step 1's.
		for (j = 0; j <= 255; j++) {
			copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
			copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
		for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
				ptr[copyStart[c1]++] = k;
		for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
				ptr[copyEnd[c1]--] = k;
		/* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
		 * Necessity for this case is demonstrated by compressing
		 * a sequence of approximately 48.5 million copies of
		 * character 251; 1.0.0/1.0.1 will then die here. */
		AssertH((copyStart[ss]-1 == copyEnd[ss])
			|| (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);

		for (j = 0; j <= 255; j++)
			ftab[(j << 8) + ss] |= SETMASK;
		 * The [ss] big bucket is now done.  Record this fact,
		 * and update the quadrant descriptors.  Remember to
		 * update quadrants in the overshoot area too, if
		 * necessary.  The "if (i < 255)" test merely skips
		 * this updating for the last bucket processed, since
		 * updating for the last bucket is pointless.
		 * The quadrant array provides a way to incrementally
		 * cache sort orderings, as they appear, so as to
		 * make subsequent comparisons in mainGtU() complete
		 * faster.  For repetitive blocks this makes a big
		 * difference (but not big enough to be able to avoid
		 * the fallback sorting mechanism, exponential radix sort).
		 * The precise meaning is: at all times:
		 *	for 0 <= i < nblock and 0 <= j <= nblock
		 *	if block[i] != block[j],
		 *		then the relative values of quadrant[i] and
		 *		quadrant[j] are meaningless.
		 *		if quadrant[i] < quadrant[j]
		 *		then the string starting at i lexicographically
		 *		precedes the string starting at j
		 *		else if quadrant[i] > quadrant[j]
		 *		then the string starting at j lexicographically
		 *		precedes the string starting at i
		 *		the relative ordering of the strings starting
		 *		at i and j has not yet been determined.
			int32_t bbStart = ftab[ss << 8] & CLEARMASK;
			int32_t bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
			while ((bbSize >> shifts) > 65534) shifts++;
			for (j = bbSize-1; j >= 0; j--) {
				int32_t a2update = ptr[bbStart + j];
				uint16_t qVal = (uint16_t)(j >> shifts);
				quadrant[a2update] = qVal;
				if (a2update < BZ_N_OVERSHOOT)
					quadrant[a2update + nblock] = qVal;
			AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
/*---------------------------------------------*/
 * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
 * ((uint8_t*)arr2)[0 .. nblock-1] holds block
 * arr1 exists for [0 .. nblock-1]
 * ((uint8_t*)arr2) [0 .. nblock-1] holds block
 * All other areas of block destroyed
 * ftab[0 .. 65536] destroyed
 * arr1[0 .. nblock-1] holds sorted order
void BZ2_blockSort(EState* s)
	/* In original bzip2 1.0.4, it's a parameter, but 30
	 * (which was the default) should work ok. */
	enum { wfact = 30 };
	uint32_t* ptr    = s->ptr;
	uint8_t*  block  = s->block;
	uint32_t* ftab   = s->ftab;
	int32_t   nblock = s->nblock;
	if (nblock < 10000) {
		fallbackSort(s->arr1, s->arr2, ftab, nblock);
		/* Calculate the location for quadrant, remembering to get
		 * the alignment right.  Assumes that &(block[0]) is at least
		 * 2-byte aligned -- this should be ok since block is really
		 * the first section of arr2.
		i = nblock + BZ_N_OVERSHOOT;
		quadrant = (uint16_t*)(&(block[i]));
		/* (wfact-1) / 3 puts the default-factor-30
		 * transition point at very roughly the same place as
		 * with v0.1 and v0.9.0.
		 * Not that it particularly matters any more, since the
		 * resulting compressed stream is now the same regardless
		 * of whether or not we use the main sort or fallback sort.
		budget = nblock * ((wfact-1) / 3);
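
		/* mainSort() and its helpers spend this budget as they perform
		 * string comparisons; if it goes negative, sorting was abandoned
		 * (see the Pre/Post comment above mainSort) and the whole block
		 * is re-sorted with the slower but dependable fallbackSort(). */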
		mainSort(ptr, block, quadrant, ftab, nblock, &budget);
			fallbackSort(s->arr1, s->arr2, ftab, nblock);
	for (i = 0; i < s->nblock; i++)
			s->origPtr = i; break;
	AssertH(s->origPtr != -1, 1003);
/*-------------------------------------------------------------*/
/*--- end                                       blocksort.c ---*/
/*-------------------------------------------------------------*/