bzip2: size reduction, to just below 9k.
[people/mcb30/busybox.git] / archival / bz / blocksort.c
1 /*
2  * bzip2 is written by Julian Seward <jseward@bzip.org>.
3  * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
4  * See README and LICENSE files in this directory for more information.
5  */
6
7 /*-------------------------------------------------------------*/
8 /*--- Block sorting machinery                               ---*/
9 /*---                                           blocksort.c ---*/
10 /*-------------------------------------------------------------*/
11
12 /* ------------------------------------------------------------------
13 This file is part of bzip2/libbzip2, a program and library for
14 lossless, block-sorting data compression.
15
16 bzip2/libbzip2 version 1.0.4 of 20 December 2006
17 Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
18
19 Please read the WARNING, DISCLAIMER and PATENTS sections in the
20 README file.
21
22 This program is released under the terms of the license contained
23 in the file LICENSE.
24 ------------------------------------------------------------------ */
25
26 /* #include "bzlib_private.h" */
27
28 #define mswap(zz1, zz2) \
29 { \
30         int32_t zztmp = zz1; \
31         zz1 = zz2; \
32         zz2 = zztmp; \
33 }
34
35 static
36 /* No measurable speed gain with inlining */
37 /* ALWAYS_INLINE */
38 void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
39 {
40         while (zzn > 0) {
41                 mswap(ptr[zzp1], ptr[zzp2]);
42                 zzp1++;
43                 zzp2++;
44                 zzn--;
45         }
46 }
47
48 static
49 ALWAYS_INLINE
50 int32_t mmin(int32_t a, int32_t b)
51 {
52         return (a < b) ? a : b;
53 }
54
55
56 /*---------------------------------------------*/
57 /*--- Fallback O(N log(N)^2) sorting        ---*/
58 /*--- algorithm, for repetitive blocks      ---*/
59 /*---------------------------------------------*/
60
61 /*---------------------------------------------*/
62 static
63 inline
64 void fallbackSimpleSort(uint32_t* fmap,
65                 uint32_t* eclass,
66                 int32_t   lo,
67                 int32_t   hi)
68 {
69         int32_t i, j, tmp;
70         uint32_t ec_tmp;
71
72         if (lo == hi) return;
73
74         if (hi - lo > 3) {
75                 for (i = hi-4; i >= lo; i--) {
76                         tmp = fmap[i];
77                         ec_tmp = eclass[tmp];
78                         for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
79                                 fmap[j-4] = fmap[j];
80                         fmap[j-4] = tmp;
81                 }
82         }
83
84         for (i = hi-1; i >= lo; i--) {
85                 tmp = fmap[i];
86                 ec_tmp = eclass[tmp];
87                 for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
88                         fmap[j-1] = fmap[j];
89                 fmap[j-1] = tmp;
90         }
91 }
92
93
94 /*---------------------------------------------*/
95 #define fpush(lz,hz) { \
96         stackLo[sp] = lz; \
97         stackHi[sp] = hz; \
98         sp++; \
99 }
100
101 #define fpop(lz,hz) { \
102         sp--; \
103         lz = stackLo[sp]; \
104         hz = stackHi[sp]; \
105 }
106
107 #define FALLBACK_QSORT_SMALL_THRESH 10
108 #define FALLBACK_QSORT_STACK_SIZE   100
109
110 static
111 void fallbackQSort3(uint32_t* fmap,
112                 uint32_t* eclass,
113                 int32_t   loSt,
114                 int32_t   hiSt)
115 {
116         int32_t unLo, unHi, ltLo, gtHi, n, m;
117         int32_t sp, lo, hi;
118         uint32_t med, r, r3;
119         int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
120         int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
121
122         r = 0;
123
124         sp = 0;
125         fpush(loSt, hiSt);
126
127         while (sp > 0) {
128                 AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
129
130                 fpop(lo, hi);
131                 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
132                         fallbackSimpleSort(fmap, eclass, lo, hi);
133                         continue;
134                 }
135
136                 /* Random partitioning.  Median of 3 sometimes fails to
137                  * avoid bad cases.  Median of 9 seems to help but
138                  * looks rather expensive.  This too seems to work but
139                  * is cheaper.  Guidance for the magic constants
140                  * 7621 and 32768 is taken from Sedgewick's algorithms
141                  * book, chapter 35.
142                  */
143                 r = ((r * 7621) + 1) % 32768;
144                 r3 = r % 3;
145                 if (r3 == 0)
146                         med = eclass[fmap[lo]];
147                 else if (r3 == 1)
148                         med = eclass[fmap[(lo+hi)>>1]];
149                 else
150                         med = eclass[fmap[hi]];
151
152                 unLo = ltLo = lo;
153                 unHi = gtHi = hi;
154
155                 while (1) {
156                         while (1) {
157                                 if (unLo > unHi) break;
158                                 n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
159                                 if (n == 0) {
160                                         mswap(fmap[unLo], fmap[ltLo]);
161                                         ltLo++;
162                                         unLo++;
163                                         continue;
164                                 };
165                                 if (n > 0) break;
166                                 unLo++;
167                         }
168                         while (1) {
169                                 if (unLo > unHi) break;
170                                 n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
171                                 if (n == 0) {
172                                         mswap(fmap[unHi], fmap[gtHi]);
173                                         gtHi--; unHi--;
174                                         continue;
175                                 };
176                                 if (n < 0) break;
177                                 unHi--;
178                         }
179                         if (unLo > unHi) break;
180                         mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
181                 }
182
183                 AssertD(unHi == unLo-1, "fallbackQSort3(2)");
184
185                 if (gtHi < ltLo) continue;
186
187                 n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
188                 m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
189
190                 n = lo + unLo - ltLo - 1;
191                 m = hi - (gtHi - unHi) + 1;
192
193                 if (n - lo > hi - m) {
194                         fpush(lo, n);
195                         fpush(m, hi);
196                 } else {
197                         fpush(m, hi);
198                         fpush(lo, n);
199                 }
200         }
201 }
202
203 #undef fpush
204 #undef fpop
205 #undef FALLBACK_QSORT_SMALL_THRESH
206 #undef FALLBACK_QSORT_STACK_SIZE
207
208
209 /*---------------------------------------------*/
210 /* Pre:
211  *      nblock > 0
212  *      eclass exists for [0 .. nblock-1]
213  *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
214  *      ptr exists for [0 .. nblock-1]
215  *
216  * Post:
217  *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
218  *      All other areas of eclass destroyed
219  *      fmap [0 .. nblock-1] holds sorted order
220  *      bhtab[0 .. 2+(nblock/32)] destroyed
221 */
222
223 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
224 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
225 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
226 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
227 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
228
229 static
230 void fallbackSort(uint32_t* fmap,
231                 uint32_t* eclass,
232                 uint32_t* bhtab,
233                 int32_t   nblock)
234 {
235         int32_t ftab[257];
236         int32_t ftabCopy[256];
237         int32_t H, i, j, k, l, r, cc, cc1;
238         int32_t nNotDone;
239         int32_t nBhtab;
240         uint8_t* eclass8 = (uint8_t*)eclass;
241
242         /*
243          * Initial 1-char radix sort to generate
244          * initial fmap and initial BH bits.
245          */
246         for (i = 0; i < 257;    i++) ftab[i] = 0;
247         for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
248         for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
249         for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
250
251         for (i = 0; i < nblock; i++) {
252                 j = eclass8[i];
253                 k = ftab[j] - 1;
254                 ftab[j] = k;
255                 fmap[k] = i;
256         }
257
258         nBhtab = 2 + (nblock / 32);
259         for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
260         for (i = 0; i < 256; i++) SET_BH(ftab[i]);
261
262         /*
263          * Inductively refine the buckets.  Kind-of an
264          * "exponential radix sort" (!), inspired by the
265          * Manber-Myers suffix array construction algorithm.
266          */
267
268         /*-- set sentinel bits for block-end detection --*/
269         for (i = 0; i < 32; i++) {
270                 SET_BH(nblock + 2*i);
271                 CLEAR_BH(nblock + 2*i + 1);
272         }
273
274         /*-- the log(N) loop --*/
275         H = 1;
276         while (1) {
277                 j = 0;
278                 for (i = 0; i < nblock; i++) {
279                         if (ISSET_BH(i))
280                                 j = i;
281                         k = fmap[i] - H;
282                         if (k < 0)
283                                 k += nblock;
284                         eclass[k] = j;
285                 }
286
287                 nNotDone = 0;
288                 r = -1;
289                 while (1) {
290
291                 /*-- find the next non-singleton bucket --*/
292                         k = r + 1;
293                         while (ISSET_BH(k) && UNALIGNED_BH(k))
294                                 k++;
295                         if (ISSET_BH(k)) {
296                                 while (WORD_BH(k) == 0xffffffff) k += 32;
297                                 while (ISSET_BH(k)) k++;
298                         }
299                         l = k - 1;
300                         if (l >= nblock)
301                                 break;
302                         while (!ISSET_BH(k) && UNALIGNED_BH(k))
303                                 k++;
304                         if (!ISSET_BH(k)) {
305                                 while (WORD_BH(k) == 0x00000000) k += 32;
306                                 while (!ISSET_BH(k)) k++;
307                         }
308                         r = k - 1;
309                         if (r >= nblock)
310                                 break;
311
312                         /*-- now [l, r] bracket current bucket --*/
313                         if (r > l) {
314                                 nNotDone += (r - l + 1);
315                                 fallbackQSort3(fmap, eclass, l, r);
316
317                                 /*-- scan bucket and generate header bits-- */
318                                 cc = -1;
319                                 for (i = l; i <= r; i++) {
320                                         cc1 = eclass[fmap[i]];
321                                         if (cc != cc1) {
322                                                 SET_BH(i);
323                                                 cc = cc1;
324                                         };
325                                 }
326                         }
327                 }
328
329                 H *= 2;
330                 if (H > nblock || nNotDone == 0)
331                         break;
332         }
333
334         /*
335          * Reconstruct the original block in
336          * eclass8 [0 .. nblock-1], since the
337          * previous phase destroyed it.
338          */
339         j = 0;
340         for (i = 0; i < nblock; i++) {
341                 while (ftabCopy[j] == 0)
342                         j++;
343                 ftabCopy[j]--;
344                 eclass8[fmap[i]] = (uint8_t)j;
345         }
346         AssertH(j < 256, 1005);
347 }
348
349 #undef       SET_BH
350 #undef     CLEAR_BH
351 #undef     ISSET_BH
352 #undef      WORD_BH
353 #undef UNALIGNED_BH
354
355
356 /*---------------------------------------------*/
357 /*--- The main, O(N^2 log(N)) sorting       ---*/
358 /*--- algorithm.  Faster for "normal"       ---*/
359 /*--- non-repetitive blocks.                ---*/
360 /*---------------------------------------------*/
361
362 /*---------------------------------------------*/
363 static
364 NOINLINE
365 int mainGtU(
366                 uint32_t  i1,
367                 uint32_t  i2,
368                 uint8_t*  block,
369                 uint16_t* quadrant,
370                 uint32_t  nblock,
371                 int32_t*  budget)
372 {
373         int32_t  k;
374         uint8_t  c1, c2;
375         uint16_t s1, s2;
376
377 /* Loop unrolling here is actually very useful
378  * (generated code is much simpler),
379  * code size increase is only 270 bytes (i386)
380  * but speeds up compression 10% overall
381  */
382
383 #if CONFIG_BZIP2_FEATURE_SPEED >= 1
384
385 #define TIMES_8(code) \
386         code; code; code; code; \
387         code; code; code; code;
388 #define TIMES_12(code) \
389         code; code; code; code; \
390         code; code; code; code; \
391         code; code; code; code;
392
393 #else
394
395 #define TIMES_8(code) \
396 { \
397         int nn = 8; \
398         do { \
399                 code; \
400         } while (--nn); \
401 }
402 #define TIMES_12(code) \
403 { \
404         int nn = 12; \
405         do { \
406                 code; \
407         } while (--nn); \
408 }
409
410 #endif
411
412         AssertD(i1 != i2, "mainGtU");
413         TIMES_12(
414                 c1 = block[i1]; c2 = block[i2];
415                 if (c1 != c2) return (c1 > c2);
416                 i1++; i2++;
417         )
418
419         k = nblock + 8;
420
421         do {
422                 TIMES_8(
423                         c1 = block[i1]; c2 = block[i2];
424                         if (c1 != c2) return (c1 > c2);
425                         s1 = quadrant[i1]; s2 = quadrant[i2];
426                         if (s1 != s2) return (s1 > s2);
427                         i1++; i2++;
428                 )
429
430                 if (i1 >= nblock) i1 -= nblock;
431                 if (i2 >= nblock) i2 -= nblock;
432
433                 (*budget)--;
434                 k -= 8;
435         } while (k >= 0);
436
437         return False;
438 }
439 #undef TIMES_8
440 #undef TIMES_12
441
442 /*---------------------------------------------*/
443 /*
444  * Knuth's increments seem to work better
445  * than Incerpi-Sedgewick here.  Possibly
446  * because the number of elems to sort is
447  * usually small, typically <= 20.
448  */
449 static
450 const int32_t incs[14] = {
451         1, 4, 13, 40, 121, 364, 1093, 3280,
452         9841, 29524, 88573, 265720,
453         797161, 2391484
454 };
455
456 static
457 void mainSimpleSort(uint32_t* ptr,
458                 uint8_t*  block,
459                 uint16_t* quadrant,
460                 int32_t   nblock,
461                 int32_t   lo,
462                 int32_t   hi,
463                 int32_t   d,
464                 int32_t*  budget)
465 {
466         int32_t i, j, h, bigN, hp;
467         uint32_t v;
468
469         bigN = hi - lo + 1;
470         if (bigN < 2) return;
471
472         hp = 0;
473         while (incs[hp] < bigN) hp++;
474         hp--;
475
476         for (; hp >= 0; hp--) {
477                 h = incs[hp];
478
479                 i = lo + h;
480                 while (1) {
481                         /*-- copy 1 --*/
482                         if (i > hi) break;
483                         v = ptr[i];
484                         j = i;
485                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
486                                 ptr[j] = ptr[j-h];
487                                 j = j - h;
488                                 if (j <= (lo + h - 1)) break;
489                         }
490                         ptr[j] = v;
491                         i++;
492
493 /* 1.5% overall speedup, +290 bytes */
494 #if CONFIG_BZIP2_FEATURE_SPEED >= 3
495                         /*-- copy 2 --*/
496                         if (i > hi) break;
497                         v = ptr[i];
498                         j = i;
499                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
500                                 ptr[j] = ptr[j-h];
501                                 j = j - h;
502                                 if (j <= (lo + h - 1)) break;
503                         }
504                         ptr[j] = v;
505                         i++;
506
507                         /*-- copy 3 --*/
508                         if (i > hi) break;
509                         v = ptr[i];
510                         j = i;
511                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
512                                 ptr[j] = ptr[j-h];
513                                 j = j - h;
514                                 if (j <= (lo + h - 1)) break;
515                         }
516                         ptr[j] = v;
517                         i++;
518 #endif
519                         if (*budget < 0) return;
520                 }
521         }
522 }
523
524
525 /*---------------------------------------------*/
526 /*
527  * The following is an implementation of
528  * an elegant 3-way quicksort for strings,
529  * described in a paper "Fast Algorithms for
530  * Sorting and Searching Strings", by Robert
531  * Sedgewick and Jon L. Bentley.
532  */
533
534 static
535 ALWAYS_INLINE
536 uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
537 {
538         uint8_t t;
539         if (a > b) {
540                 t = a;
541                 a = b;
542                 b = t;
543         };
544         /* here b >= a */
545         if (b > c) {
546                 b = c;
547                 if (a > b)
548                         b = a;
549         }
550         return b;
551 }
552
553 #define mpush(lz,hz,dz) \
554 { \
555         stackLo[sp] = lz; \
556         stackHi[sp] = hz; \
557         stackD [sp] = dz; \
558         sp++; \
559 }
560
561 #define mpop(lz,hz,dz) \
562 { \
563         sp--; \
564         lz = stackLo[sp]; \
565         hz = stackHi[sp]; \
566         dz = stackD [sp]; \
567 }
568
569 #define mnextsize(az) (nextHi[az] - nextLo[az])
570
571 #define mnextswap(az,bz) \
572 { \
573         int32_t tz; \
574         tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
575         tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
576         tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
577 }
578
579 #define MAIN_QSORT_SMALL_THRESH 20
580 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
581 #define MAIN_QSORT_STACK_SIZE   100
582
583 static
584 void mainQSort3(uint32_t* ptr,
585                 uint8_t*  block,
586                 uint16_t* quadrant,
587                 int32_t   nblock,
588                 int32_t   loSt,
589                 int32_t   hiSt,
590                 int32_t   dSt,
591                 int32_t*  budget)
592 {
593         int32_t unLo, unHi, ltLo, gtHi, n, m, med;
594         int32_t sp, lo, hi, d;
595
596         int32_t stackLo[MAIN_QSORT_STACK_SIZE];
597         int32_t stackHi[MAIN_QSORT_STACK_SIZE];
598         int32_t stackD [MAIN_QSORT_STACK_SIZE];
599
600         int32_t nextLo[3];
601         int32_t nextHi[3];
602         int32_t nextD [3];
603
604         sp = 0;
605         mpush(loSt, hiSt, dSt);
606
607         while (sp > 0) {
608                 AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
609
610                 mpop(lo, hi, d);
611                 if (hi - lo < MAIN_QSORT_SMALL_THRESH
612                  || d > MAIN_QSORT_DEPTH_THRESH
613                 ) {
614                         mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
615                         if (*budget < 0)
616                                 return;
617                         continue;
618                 }
619                 med = (int32_t) mmed3(block[ptr[lo          ] + d],
620                                       block[ptr[hi          ] + d],
621                                       block[ptr[(lo+hi) >> 1] + d]);
622
623                 unLo = ltLo = lo;
624                 unHi = gtHi = hi;
625
626                 while (1) {
627                         while (1) {
628                                 if (unLo > unHi)
629                                         break;
630                                 n = ((int32_t)block[ptr[unLo]+d]) - med;
631                                 if (n == 0) {
632                                         mswap(ptr[unLo], ptr[ltLo]);
633                                         ltLo++;
634                                         unLo++;
635                                         continue;
636                                 };
637                                 if (n >  0) break;
638                                 unLo++;
639                         }
640                         while (1) {
641                                 if (unLo > unHi)
642                                         break;
643                                 n = ((int32_t)block[ptr[unHi]+d]) - med;
644                                 if (n == 0) {
645                                         mswap(ptr[unHi], ptr[gtHi]);
646                                         gtHi--;
647                                         unHi--;
648                                         continue;
649                                 };
650                                 if (n <  0) break;
651                                 unHi--;
652                         }
653                         if (unLo > unHi)
654                                 break;
655                         mswap(ptr[unLo], ptr[unHi]);
656                         unLo++;
657                         unHi--;
658                 }
659
660                 AssertD(unHi == unLo-1, "mainQSort3(2)");
661
662                 if (gtHi < ltLo) {
663                         mpush(lo, hi, d + 1);
664                         continue;
665                 }
666
667                 n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
668                 m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
669
670                 n = lo + unLo - ltLo - 1;
671                 m = hi - (gtHi - unHi) + 1;
672
673                 nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
674                 nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
675                 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
676
677                 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
678                 if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
679                 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
680
681                 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
682                 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
683
684                 mpush(nextLo[0], nextHi[0], nextD[0]);
685                 mpush(nextLo[1], nextHi[1], nextD[1]);
686                 mpush(nextLo[2], nextHi[2], nextD[2]);
687         }
688 }
689
690 #undef mpush
691 #undef mpop
692 #undef mnextsize
693 #undef mnextswap
694 #undef MAIN_QSORT_SMALL_THRESH
695 #undef MAIN_QSORT_DEPTH_THRESH
696 #undef MAIN_QSORT_STACK_SIZE
697
698
699 /*---------------------------------------------*/
700 /* Pre:
701  *      nblock > N_OVERSHOOT
702  *      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
703  *      ((uint8_t*)block32) [0 .. nblock-1] holds block
704  *      ptr exists for [0 .. nblock-1]
705  *
706  * Post:
707  *      ((uint8_t*)block32) [0 .. nblock-1] holds block
708  *      All other areas of block32 destroyed
709  *      ftab[0 .. 65536] destroyed
710  *      ptr [0 .. nblock-1] holds sorted order
711  *      if (*budget < 0), sorting was abandoned
712  */
713
714 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
715 #define SETMASK (1 << 21)
716 #define CLEARMASK (~(SETMASK))
717
718 static NOINLINE
719 void mainSort(uint32_t*   ptr,
720                 uint8_t*  block,
721                 uint16_t* quadrant,
722                 uint32_t* ftab,
723                 int32_t   nblock,
724                 int32_t*  budget)
725 {
726         int32_t  i, j, k, ss, sb;
727         int32_t  runningOrder[256];
728         Bool     bigDone[256];
729         int32_t  copyStart[256];
730         int32_t  copyEnd  [256];
731         uint8_t  c1;
732         int32_t  numQSorted;
733         uint16_t s;
734
735         /*-- set up the 2-byte frequency table --*/
736         /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
737         memset(ftab, 0, 65537 * sizeof(ftab[0]));
738
739         j = block[0] << 8;
740         i = nblock-1;
741 /* 3%, +300 bytes */
742 #if CONFIG_BZIP2_FEATURE_SPEED >= 2
743         for (; i >= 3; i -= 4) {
744                 quadrant[i] = 0;
745                 j = (j >> 8) |(((uint16_t)block[i]) << 8);
746                 ftab[j]++;
747                 quadrant[i-1] = 0;
748                 j = (j >> 8) |(((uint16_t)block[i-1]) << 8);
749                 ftab[j]++;
750                 quadrant[i-2] = 0;
751                 j = (j >> 8) |(((uint16_t)block[i-2]) << 8);
752                 ftab[j]++;
753                 quadrant[i-3] = 0;
754                 j = (j >> 8) |(((uint16_t)block[i-3]) << 8);
755                 ftab[j]++;
756         }
757 #endif
758         for (; i >= 0; i--) {
759                 quadrant[i] = 0;
760                 j = (j >> 8) |(((uint16_t)block[i]) << 8);
761                 ftab[j]++;
762         }
763
764         /*-- (emphasises close relationship of block & quadrant) --*/
765         for (i = 0; i < BZ_N_OVERSHOOT; i++) {
766                 block   [nblock+i] = block[i];
767                 quadrant[nblock+i] = 0;
768         }
769
770         /*-- Complete the initial radix sort --*/
771         for (i = 1; i <= 65536; i++)
772                 ftab[i] += ftab[i-1];
773
774         s = block[0] << 8;
775         i = nblock-1;
776 #if CONFIG_BZIP2_FEATURE_SPEED >= 2
777         for (; i >= 3; i -= 4) {
778                 s = (s >> 8) | (block[i] << 8);
779                 j = ftab[s] -1;
780                 ftab[s] = j;
781                 ptr[j] = i;
782                 s = (s >> 8) | (block[i-1] << 8);
783                 j = ftab[s] -1;
784                 ftab[s] = j;
785                 ptr[j] = i-1;
786                 s = (s >> 8) | (block[i-2] << 8);
787                 j = ftab[s] -1;
788                 ftab[s] = j;
789                 ptr[j] = i-2;
790                 s = (s >> 8) | (block[i-3] << 8);
791                 j = ftab[s] -1;
792                 ftab[s] = j;
793                 ptr[j] = i-3;
794         }
795 #endif
796         for (; i >= 0; i--) {
797                 s = (s >> 8) | (block[i] << 8);
798                 j = ftab[s] -1;
799                 ftab[s] = j;
800                 ptr[j] = i;
801         }
802
803         /*
804          * Now ftab contains the first loc of every small bucket.
805          * Calculate the running order, from smallest to largest
806          * big bucket.
807          */
808         for (i = 0; i <= 255; i++) {
809                 bigDone     [i] = False;
810                 runningOrder[i] = i;
811         }
812
813         {
814                 int32_t vv;
815                 /* was: int32_t h = 1; */
816                 /* do h = 3 * h + 1; while (h <= 256); */
817                 int32_t h = 364;
818
819                 do {
820                         h = h / 3;
821                         for (i = h; i <= 255; i++) {
822                                 vv = runningOrder[i];
823                                 j = i;
824                                 while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
825                                         runningOrder[j] = runningOrder[j-h];
826                                         j = j - h;
827                                         if (j <= (h - 1)) goto zero;
828                                 }
829                                 zero:
830                                 runningOrder[j] = vv;
831                         }
832                 } while (h != 1);
833         }
834
835         /*
836          * The main sorting loop.
837          */
838
839         numQSorted = 0;
840
841         for (i = 0; i <= 255; i++) {
842
843                 /*
844                  * Process big buckets, starting with the least full.
845                  * Basically this is a 3-step process in which we call
846                  * mainQSort3 to sort the small buckets [ss, j], but
847                  * also make a big effort to avoid the calls if we can.
848                  */
849                 ss = runningOrder[i];
850
851                 /*
852                  * Step 1:
853                  * Complete the big bucket [ss] by quicksorting
854                  * any unsorted small buckets [ss, j], for j != ss.
855                  * Hopefully previous pointer-scanning phases have already
856                  * completed many of the small buckets [ss, j], so
857                  * we don't have to sort them at all.
858                  */
859                 for (j = 0; j <= 255; j++) {
860                         if (j != ss) {
861                                 sb = (ss << 8) + j;
862                                 if (!(ftab[sb] & SETMASK)) {
863                                         int32_t lo = ftab[sb]   & CLEARMASK;
864                                         int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
865                                         if (hi > lo) {
866                                                 mainQSort3 (
867                                                         ptr, block, quadrant, nblock,
868                                                         lo, hi, BZ_N_RADIX, budget
869                                                 );
870                                                 if (*budget < 0) return;
871                                                 numQSorted += (hi - lo + 1);
872                                         }
873                                 }
874                                 ftab[sb] |= SETMASK;
875                         }
876                 }
877
878                 AssertH(!bigDone[ss], 1006);
879
880                 /*
881                  * Step 2:
882                  * Now scan this big bucket [ss] so as to synthesise the
883                  * sorted order for small buckets [t, ss] for all t,
884                  * including, magically, the bucket [ss,ss] too.
885                  * This will avoid doing Real Work in subsequent Step 1's.
886                  */
887                 {
888                         for (j = 0; j <= 255; j++) {
889                                 copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
890                                 copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
891                         }
892                         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
893                                 k = ptr[j] - 1;
894                                 if (k < 0)
895                                         k += nblock;
896                                 c1 = block[k];
897                                 if (!bigDone[c1])
898                                         ptr[copyStart[c1]++] = k;
899                         }
900                         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
901                                 k = ptr[j]-1;
902                                 if (k < 0)
903                                         k += nblock;
904                                 c1 = block[k];
905                                 if (!bigDone[c1])
906                                         ptr[copyEnd[c1]--] = k;
907                         }
908                 }
909
910                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
911                  * Necessity for this case is demonstrated by compressing
912                  * a sequence of approximately 48.5 million of character
913                  * 251; 1.0.0/1.0.1 will then die here. */
914                 AssertH((copyStart[ss]-1 == copyEnd[ss]) \
915                      || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
916
917                 for (j = 0; j <= 255; j++)
918                         ftab[(j << 8) + ss] |= SETMASK;
919
920                 /*
921                  * Step 3:
922                  * The [ss] big bucket is now done.  Record this fact,
923                  * and update the quadrant descriptors.  Remember to
924                  * update quadrants in the overshoot area too, if
925                  * necessary.  The "if (i < 255)" test merely skips
926                  * this updating for the last bucket processed, since
927                  * updating for the last bucket is pointless.
928                  *
929                  * The quadrant array provides a way to incrementally
930                  * cache sort orderings, as they appear, so as to
931                  * make subsequent comparisons in fullGtU() complete
932                  * faster.  For repetitive blocks this makes a big
933                  * difference (but not big enough to be able to avoid
934                  * the fallback sorting mechanism, exponential radix sort).
935                  *
936                  * The precise meaning is: at all times:
937                  *
938                  *      for 0 <= i < nblock and 0 <= j <= nblock
939                  *
940                  *      if block[i] != block[j],
941                  *
942                  *              then the relative values of quadrant[i] and
943                  *                        quadrant[j] are meaningless.
944                  *
945                  *              else {
946                  *                      if quadrant[i] < quadrant[j]
947                  *                              then the string starting at i lexicographically
948                  *                              precedes the string starting at j
949                  *
950                  *                      else if quadrant[i] > quadrant[j]
951                  *                              then the string starting at j lexicographically
952                  *                              precedes the string starting at i
953                  *
954                  *                      else
955                  *                              the relative ordering of the strings starting
956                  *                              at i and j has not yet been determined.
957                  *              }
958                  */
959                 bigDone[ss] = True;
960
961                 if (i < 255) {
962                         int32_t bbStart = ftab[ss << 8] & CLEARMASK;
963                         int32_t bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
964                         int32_t shifts  = 0;
965
966                         while ((bbSize >> shifts) > 65534) shifts++;
967
968                         for (j = bbSize-1; j >= 0; j--) {
969                                 int32_t a2update     = ptr[bbStart + j];
970                                 uint16_t qVal        = (uint16_t)(j >> shifts);
971                                 quadrant[a2update] = qVal;
972                                 if (a2update < BZ_N_OVERSHOOT)
973                                         quadrant[a2update + nblock] = qVal;
974                         }
975                         AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
976                 }
977
978         }
979 }
980
981 #undef BIGFREQ
982 #undef SETMASK
983 #undef CLEARMASK
984
985
986 /*---------------------------------------------*/
987 /* Pre:
988  *      nblock > 0
989  *      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
990  *        ((uint8_t*)arr2)[0 .. nblock-1] holds block
991  *      arr1 exists for [0 .. nblock-1]
992  *
993  * Post:
994  *      ((uint8_t*)arr2) [0 .. nblock-1] holds block
995  *      All other areas of block destroyed
996  *      ftab[0 .. 65536] destroyed
997  *      arr1[0 .. nblock-1] holds sorted order
998  */
999 static NOINLINE
1000 void BZ2_blockSort(EState* s)
1001 {
1002         /* In original bzip2 1.0.4, it's a parameter, but 30
1003          * (which was the default) should work ok. */
1004         enum { wfact = 30 };
1005
1006         uint32_t* ptr    = s->ptr;
1007         uint8_t*  block  = s->block;
1008         uint32_t* ftab   = s->ftab;
1009         int32_t   nblock = s->nblock;
1010         uint16_t* quadrant;
1011         int32_t   budget;
1012         int32_t   i;
1013
1014         if (nblock < 10000) {
1015                 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1016         } else {
1017                 /* Calculate the location for quadrant, remembering to get
1018                  * the alignment right.  Assumes that &(block[0]) is at least
1019                  * 2-byte aligned -- this should be ok since block is really
1020                  * the first section of arr2.
1021                  */
1022                 i = nblock + BZ_N_OVERSHOOT;
1023                 if (i & 1) i++;
1024                 quadrant = (uint16_t*)(&(block[i]));
1025
1026                 /* (wfact-1) / 3 puts the default-factor-30
1027                  * transition point at very roughly the same place as
1028                  * with v0.1 and v0.9.0.
1029                  * Not that it particularly matters any more, since the
1030                  * resulting compressed stream is now the same regardless
1031                  * of whether or not we use the main sort or fallback sort.
1032                  */
1033                 budget = nblock * ((wfact-1) / 3);
1034
1035                 mainSort(ptr, block, quadrant, ftab, nblock, &budget);
1036                 if (budget < 0) {
1037                         fallbackSort(s->arr1, s->arr2, ftab, nblock);
1038                 }
1039         }
1040
1041         s->origPtr = -1;
1042         for (i = 0; i < s->nblock; i++)
1043                 if (ptr[i] == 0) {
1044                         s->origPtr = i; break;
1045                 };
1046
1047         AssertH(s->origPtr != -1, 1003);
1048 }
1049
1050
1051 /*-------------------------------------------------------------*/
1052 /*--- end                                       blocksort.c ---*/
1053 /*-------------------------------------------------------------*/