/* #include "bzlib_private.h" */
+#define mswap(zz1, zz2) \
+{ \
+ int32_t zztmp = zz1; \
+ zz1 = zz2; \
+ zz2 = zztmp; \
+}
+
+static
+/* No measurable speed gain with inlining */
+/* ALWAYS_INLINE */
+void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
+{
+ while (zzn > 0) {
+ mswap(ptr[zzp1], ptr[zzp2]);
+ zzp1++;
+ zzp2++;
+ zzn--;
+ }
+}
+
+static
+ALWAYS_INLINE
+int32_t mmin(int32_t a, int32_t b)
+{
+ return (a < b) ? a : b;
+}
+
+
/*---------------------------------------------*/
/*--- Fallback O(N log(N)^2) sorting ---*/
/*--- algorithm, for repetitive blocks ---*/
/*---------------------------------------------*/
-#define fswap(zz1, zz2) \
-{ \
- int32_t zztmp = zz1; \
- zz1 = zz2; \
- zz2 = zztmp; \
-}
-
-#define fvswap(zzp1, zzp2, zzn) \
-{ \
- int32_t yyp1 = (zzp1); \
- int32_t yyp2 = (zzp2); \
- int32_t yyn = (zzn); \
- while (yyn > 0) { \
- fswap(fmap[yyp1], fmap[yyp2]); \
- yyp1++; \
- yyp2++; \
- yyn--; \
- } \
-}
-
-
-#define fmin(a,b) ((a) < (b)) ? (a) : (b)
-
#define fpush(lz,hz) { \
stackLo[sp] = lz; \
stackHi[sp] = hz; \
#define FALLBACK_QSORT_SMALL_THRESH 10
#define FALLBACK_QSORT_STACK_SIZE 100
-
static
void fallbackQSort3(uint32_t* fmap,
uint32_t* eclass,
if (unLo > unHi) break;
n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
if (n == 0) {
- fswap(fmap[unLo], fmap[ltLo]);
+ mswap(fmap[unLo], fmap[ltLo]);
ltLo++;
unLo++;
continue;
if (unLo > unHi) break;
n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
if (n == 0) {
- fswap(fmap[unHi], fmap[gtHi]);
+ mswap(fmap[unHi], fmap[gtHi]);
gtHi--; unHi--;
continue;
};
unHi--;
}
if (unLo > unHi) break;
- fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
+ mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
}
AssertD(unHi == unLo-1, "fallbackQSort3(2)");
if (gtHi < ltLo) continue;
- n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
- m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
+ n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
+ m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
n = lo + unLo - ltLo - 1;
m = hi - (gtHi - unHi) + 1;
}
}
-#undef fmin
#undef fpush
#undef fpop
-#undef fswap
-#undef fvswap
#undef FALLBACK_QSORT_SMALL_THRESH
#undef FALLBACK_QSORT_STACK_SIZE
/* Pre:
* nblock > 0
* eclass exists for [0 .. nblock-1]
- * ((UChar*)eclass) [0 .. nblock-1] holds block
+ * ((uint8_t*)eclass) [0 .. nblock-1] holds block
* ptr exists for [0 .. nblock-1]
*
* Post:
- * ((UChar*)eclass) [0 .. nblock-1] holds block
+ * ((uint8_t*)eclass) [0 .. nblock-1] holds block
* All other areas of eclass destroyed
* fmap [0 .. nblock-1] holds sorted order
* bhtab[0 .. 2+(nblock/32)] destroyed
int32_t H, i, j, k, l, r, cc, cc1;
int32_t nNotDone;
int32_t nBhtab;
- UChar* eclass8 = (UChar*)eclass;
+ uint8_t* eclass8 = (uint8_t*)eclass;
/*
* Initial 1-char radix sort to generate
for (i = 0; i < 257; i++) ftab[i] = 0;
for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
- for (i = 1; i < 257; i++) ftab[i] += ftab[i-1];
+
+ j = ftab[0]; /* bbox: optimized */
+ for (i = 1; i < 257; i++) {
+ j += ftab[i];
+ ftab[i] = j;
+ }
for (i = 0; i < nblock; i++) {
j = eclass8[i];
fmap[k] = i;
}
- nBhtab = 2 + (nblock / 32);
+ nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
for (i = 0; i < 256; i++) SET_BH(ftab[i]);
r = -1;
while (1) {
- /*-- find the next non-singleton bucket --*/
+ /*-- find the next non-singleton bucket --*/
k = r + 1;
while (ISSET_BH(k) && UNALIGNED_BH(k))
k++;
while (ftabCopy[j] == 0)
j++;
ftabCopy[j]--;
- eclass8[fmap[i]] = (UChar)j;
+ eclass8[fmap[i]] = (uint8_t)j;
}
AssertH(j < 256, 1005);
}
/*---------------------------------------------*/
static
-inline
-Bool mainGtU(
+NOINLINE
+int mainGtU(
uint32_t i1,
uint32_t i2,
- UChar* block,
+ uint8_t* block,
uint16_t* quadrant,
uint32_t nblock,
int32_t* budget)
{
int32_t k;
- UChar c1, c2;
+ uint8_t c1, c2;
uint16_t s1, s2;
-///unrolling
- AssertD(i1 != i2, "mainGtU");
- /* 1 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 2 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 3 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 4 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 5 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 6 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 7 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 8 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 9 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 10 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 11 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
- /* 12 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- i1++; i2++;
+/* Loop unrolling here is actually very useful
+ * (generated code is much simpler),
+ * code size increase is only 270 bytes (i386)
+ * but speeds up compression 10% overall
+ */
- k = nblock + 8;
+#if CONFIG_BZIP2_FEATURE_SPEED >= 1
-///unrolling
- do {
- /* 1 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- s1 = quadrant[i1]; s2 = quadrant[i2];
- if (s1 != s2) return (s1 > s2);
- i1++; i2++;
- /* 2 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- s1 = quadrant[i1]; s2 = quadrant[i2];
- if (s1 != s2) return (s1 > s2);
- i1++; i2++;
- /* 3 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- s1 = quadrant[i1]; s2 = quadrant[i2];
- if (s1 != s2) return (s1 > s2);
- i1++; i2++;
- /* 4 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- s1 = quadrant[i1]; s2 = quadrant[i2];
- if (s1 != s2) return (s1 > s2);
- i1++; i2++;
- /* 5 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- s1 = quadrant[i1]; s2 = quadrant[i2];
- if (s1 != s2) return (s1 > s2);
- i1++; i2++;
- /* 6 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- s1 = quadrant[i1]; s2 = quadrant[i2];
- if (s1 != s2) return (s1 > s2);
- i1++; i2++;
- /* 7 */
- c1 = block[i1]; c2 = block[i2];
- if (c1 != c2) return (c1 > c2);
- s1 = quadrant[i1]; s2 = quadrant[i2];
- if (s1 != s2) return (s1 > s2);
- i1++; i2++;
- /* 8 */
+#define TIMES_8(code) \
+ code; code; code; code; \
+ code; code; code; code;
+#define TIMES_12(code) \
+ code; code; code; code; \
+ code; code; code; code; \
+ code; code; code; code;
+
+#else
+
+#define TIMES_8(code) \
+{ \
+ int nn = 8; \
+ do { \
+ code; \
+ } while (--nn); \
+}
+#define TIMES_12(code) \
+{ \
+ int nn = 12; \
+ do { \
+ code; \
+ } while (--nn); \
+}
+
+#endif
+
+ AssertD(i1 != i2, "mainGtU");
+ TIMES_12(
c1 = block[i1]; c2 = block[i2];
if (c1 != c2) return (c1 > c2);
- s1 = quadrant[i1]; s2 = quadrant[i2];
- if (s1 != s2) return (s1 > s2);
i1++; i2++;
+ )
+
+ k = nblock + 8;
+
+ do {
+ TIMES_8(
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+ )
if (i1 >= nblock) i1 -= nblock;
if (i2 >= nblock) i2 -= nblock;
- k -= 8;
(*budget)--;
+ k -= 8;
} while (k >= 0);
return False;
}
-
+#undef TIMES_8
+#undef TIMES_12
/*---------------------------------------------*/
/*
static
void mainSimpleSort(uint32_t* ptr,
- UChar* block,
+ uint8_t* block,
uint16_t* quadrant,
int32_t nblock,
int32_t lo,
i = lo + h;
while (1) {
-
-///unrolling
/*-- copy 1 --*/
if (i > hi) break;
v = ptr[i];
ptr[j] = v;
i++;
+/* 1.5% overall speedup, +290 bytes */
+#if CONFIG_BZIP2_FEATURE_SPEED >= 3
/*-- copy 2 --*/
if (i > hi) break;
v = ptr[i];
if (i > hi) break;
v = ptr[i];
j = i;
- while (mainGtU (ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
+ while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
ptr[j] = ptr[j-h];
j = j - h;
if (j <= (lo + h - 1)) break;
}
ptr[j] = v;
i++;
-
+#endif
if (*budget < 0) return;
}
}
* Sedgewick and Jon L. Bentley.
*/
-#define mswap(zz1, zz2) \
-{ \
- int32_t zztmp = zz1; \
- zz1 = zz2; \
- zz2 = zztmp; \
-}
-
-#define mvswap(zzp1, zzp2, zzn) \
-{ \
- int32_t yyp1 = (zzp1); \
- int32_t yyp2 = (zzp2); \
- int32_t yyn = (zzn); \
- while (yyn > 0) { \
- mswap(ptr[yyp1], ptr[yyp2]); \
- yyp1++; \
- yyp2++; \
- yyn--; \
- } \
-}
-
static
-inline
-UChar mmed3(UChar a, UChar b, UChar c)
+ALWAYS_INLINE
+uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
{
- UChar t;
+ uint8_t t;
if (a > b) {
t = a;
a = b;
b = t;
};
+ /* here b >= a */
if (b > c) {
b = c;
if (a > b)
return b;
}
-#define mmin(a,b) ((a) < (b)) ? (a) : (b)
-
#define mpush(lz,hz,dz) \
{ \
stackLo[sp] = lz; \
dz = stackD [sp]; \
}
-
-#define mnextsize(az) (nextHi[az]-nextLo[az])
+#define mnextsize(az) (nextHi[az] - nextLo[az])
#define mnextswap(az,bz) \
{ \
static
void mainQSort3(uint32_t* ptr,
- UChar* block,
+ uint8_t* block,
uint16_t* quadrant,
int32_t nblock,
int32_t loSt,
return;
continue;
}
-
med = (int32_t) mmed3(block[ptr[lo ] + d],
block[ptr[hi ] + d],
block[ptr[(lo+hi) >> 1] + d]);
continue;
}
- n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
- m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
+ n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
+ m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
n = lo + unLo - ltLo - 1;
m = hi - (gtHi - unHi) + 1;
nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
- if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
- if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
- if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
+ if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
+ if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
+ if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
- mpush (nextLo[0], nextHi[0], nextD[0]);
- mpush (nextLo[1], nextHi[1], nextD[1]);
- mpush (nextLo[2], nextHi[2], nextD[2]);
+ mpush(nextLo[0], nextHi[0], nextD[0]);
+ mpush(nextLo[1], nextHi[1], nextD[1]);
+ mpush(nextLo[2], nextHi[2], nextD[2]);
}
}
-#undef mswap
-#undef mvswap
#undef mpush
#undef mpop
-#undef mmin
#undef mnextsize
#undef mnextswap
#undef MAIN_QSORT_SMALL_THRESH
/* Pre:
* nblock > N_OVERSHOOT
* block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
- * ((UChar*)block32) [0 .. nblock-1] holds block
+ * ((uint8_t*)block32) [0 .. nblock-1] holds block
* ptr exists for [0 .. nblock-1]
*
* Post:
- * ((UChar*)block32) [0 .. nblock-1] holds block
+ * ((uint8_t*)block32) [0 .. nblock-1] holds block
* All other areas of block32 destroyed
* ftab[0 .. 65536] destroyed
* ptr [0 .. nblock-1] holds sorted order
static NOINLINE
void mainSort(uint32_t* ptr,
- UChar* block,
+ uint8_t* block,
uint16_t* quadrant,
uint32_t* ftab,
int32_t nblock,
{
int32_t i, j, k, ss, sb;
int32_t runningOrder[256];
- Bool bigDone[256];
+ Bool bigDone[256];
int32_t copyStart[256];
int32_t copyEnd [256];
- UChar c1;
+ uint8_t c1;
int32_t numQSorted;
uint16_t s;
memset(ftab, 0, 65537 * sizeof(ftab[0]));
j = block[0] << 8;
- i = nblock-1;
-#if 0
+ i = nblock - 1;
+/* 3%, +300 bytes */
+#if CONFIG_BZIP2_FEATURE_SPEED >= 2
for (; i >= 3; i -= 4) {
quadrant[i] = 0;
- j = (j >> 8) |(((uint16_t)block[i]) << 8);
+ j = (j >> 8) | (((uint16_t)block[i]) << 8);
ftab[j]++;
quadrant[i-1] = 0;
- j = (j >> 8) |(((uint16_t)block[i-1]) << 8);
+ j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
ftab[j]++;
quadrant[i-2] = 0;
- j = (j >> 8) |(((uint16_t)block[i-2]) << 8);
+ j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
ftab[j]++;
quadrant[i-3] = 0;
- j = (j >> 8) |(((uint16_t)block[i-3]) << 8);
+ j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
ftab[j]++;
}
#endif
for (; i >= 0; i--) {
quadrant[i] = 0;
- j = (j >> 8) |(((uint16_t)block[i]) << 8);
+ j = (j >> 8) | (((uint16_t)block[i]) << 8);
ftab[j]++;
}
}
/*-- Complete the initial radix sort --*/
- for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
+ j = ftab[0]; /* bbox: optimized */
+ for (i = 1; i <= 65536; i++) {
+ j += ftab[i];
+ ftab[i] = j;
+ }
s = block[0] << 8;
- i = nblock-1;
-#if 0
+ i = nblock - 1;
+#if CONFIG_BZIP2_FEATURE_SPEED >= 2
for (; i >= 3; i -= 4) {
s = (s >> 8) | (block[i] << 8);
- j = ftab[s] -1;
+ j = ftab[s] - 1;
ftab[s] = j;
ptr[j] = i;
s = (s >> 8) | (block[i-1] << 8);
- j = ftab[s] -1;
+ j = ftab[s] - 1;
ftab[s] = j;
ptr[j] = i-1;
s = (s >> 8) | (block[i-2] << 8);
- j = ftab[s] -1;
+ j = ftab[s] - 1;
ftab[s] = j;
ptr[j] = i-2;
s = (s >> 8) | (block[i-3] << 8);
- j = ftab[s] -1;
+ j = ftab[s] - 1;
ftab[s] = j;
ptr[j] = i-3;
}
#endif
for (; i >= 0; i--) {
s = (s >> 8) | (block[i] << 8);
- j = ftab[s] -1;
+ j = ftab[s] - 1;
ftab[s] = j;
ptr[j] = i;
}
{
int32_t vv;
- /* was: int32_t h = 1; */
+ /* bbox: was: int32_t h = 1; */
/* do h = 3 * h + 1; while (h <= 256); */
- int32_t h = 364;
+ uint32_t h = 364;
do {
- h = h / 3;
+ /*h = h / 3;*/
+ h = (h * 171) >> 9; /* bbox: fast h/3 */
for (i = h; i <= 255; i++) {
vv = runningOrder[i];
j = i;
while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
runningOrder[j] = runningOrder[j-h];
j = j - h;
- if (j <= (h - 1)) goto zero;
+ if (j <= (h - 1))
+ goto zero;
}
- zero:
+ zero:
runningOrder[j] = vv;
}
} while (h != 1);
if (j != ss) {
sb = (ss << 8) + j;
if (!(ftab[sb] & SETMASK)) {
- int32_t lo = ftab[sb] & CLEARMASK;
+ int32_t lo = ftab[sb] & CLEARMASK;
int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
if (hi > lo) {
- mainQSort3 (
+ mainQSort3(
ptr, block, quadrant, nblock,
lo, hi, BZ_N_RADIX, budget
);
- numQSorted += (hi - lo + 1);
if (*budget < 0) return;
+ numQSorted += (hi - lo + 1);
}
}
ftab[sb] |= SETMASK;
}
}
- AssertH((copyStart[ss]-1 == copyEnd[ss])
- ||
- /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
- * Necessity for this case is demonstrated by compressing
- * a sequence of approximately 48.5 million of character
- * 251; 1.0.0/1.0.1 will then die here. */
- (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
- 1007)
+ /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
+ * Necessity for this case is demonstrated by compressing
+ * a sequence of approximately 48.5 million of character
+ * 251; 1.0.0/1.0.1 will then die here. */
+ AssertH((copyStart[ss]-1 == copyEnd[ss]) \
+ || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
for (j = 0; j <= 255; j++)
ftab[(j << 8) + ss] |= SETMASK;
while ((bbSize >> shifts) > 65534) shifts++;
for (j = bbSize-1; j >= 0; j--) {
- int32_t a2update = ptr[bbStart + j];
- uint16_t qVal = (uint16_t)(j >> shifts);
+ int32_t a2update = ptr[bbStart + j];
+ uint16_t qVal = (uint16_t)(j >> shifts);
quadrant[a2update] = qVal;
if (a2update < BZ_N_OVERSHOOT)
quadrant[a2update + nblock] = qVal;
}
AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
}
-
}
}
/* Pre:
* nblock > 0
* arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
- * ((UChar*)arr2)[0 .. nblock-1] holds block
+ * ((uint8_t*)arr2)[0 .. nblock-1] holds block
* arr1 exists for [0 .. nblock-1]
*
* Post:
- * ((UChar*)arr2) [0 .. nblock-1] holds block
+ * ((uint8_t*)arr2) [0 .. nblock-1] holds block
* All other areas of block destroyed
* ftab[0 .. 65536] destroyed
* arr1[0 .. nblock-1] holds sorted order
void BZ2_blockSort(EState* s)
{
/* In original bzip2 1.0.4, it's a parameter, but 30
- * should work ok. */
+ * (which was the default) should work ok. */
enum { wfact = 30 };
uint32_t* ptr = s->ptr;
- UChar* block = s->block;
+ uint8_t* block = s->block;
uint32_t* ftab = s->ftab;
int32_t nblock = s->nblock;
uint16_t* quadrant;
s->origPtr = -1;
for (i = 0; i < s->nblock; i++)
if (ptr[i] == 0) {
- s->origPtr = i; break;
+ s->origPtr = i;
+ break;
};
AssertH(s->origPtr != -1, 1003);