bzip2: eliminate some divisions
[people/mcb30/busybox.git] / archival / bz / blocksort.c
1 /*
2  * bzip2 is written by Julian Seward <jseward@bzip.org>.
3  * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
4  * See README and LICENSE files in this directory for more information.
5  */
6
7 /*-------------------------------------------------------------*/
8 /*--- Block sorting machinery                               ---*/
9 /*---                                           blocksort.c ---*/
10 /*-------------------------------------------------------------*/
11
12 /* ------------------------------------------------------------------
13 This file is part of bzip2/libbzip2, a program and library for
14 lossless, block-sorting data compression.
15
16 bzip2/libbzip2 version 1.0.4 of 20 December 2006
17 Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
18
19 Please read the WARNING, DISCLAIMER and PATENTS sections in the
20 README file.
21
22 This program is released under the terms of the license contained
23 in the file LICENSE.
24 ------------------------------------------------------------------ */
25
26 /* #include "bzlib_private.h" */
27
28 #define mswap(zz1, zz2) \
29 { \
30         int32_t zztmp = zz1; \
31         zz1 = zz2; \
32         zz2 = zztmp; \
33 }
34
35 static
36 /* No measurable speed gain with inlining */
37 /* ALWAYS_INLINE */
38 void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
39 {
40         while (zzn > 0) {
41                 mswap(ptr[zzp1], ptr[zzp2]);
42                 zzp1++;
43                 zzp2++;
44                 zzn--;
45         }
46 }
47
48 static
49 ALWAYS_INLINE
50 int32_t mmin(int32_t a, int32_t b)
51 {
52         return (a < b) ? a : b;
53 }
54
55
56 /*---------------------------------------------*/
57 /*--- Fallback O(N log(N)^2) sorting        ---*/
58 /*--- algorithm, for repetitive blocks      ---*/
59 /*---------------------------------------------*/
60
61 /*---------------------------------------------*/
62 static
63 inline
64 void fallbackSimpleSort(uint32_t* fmap,
65                 uint32_t* eclass,
66                 int32_t   lo,
67                 int32_t   hi)
68 {
69         int32_t i, j, tmp;
70         uint32_t ec_tmp;
71
72         if (lo == hi) return;
73
74         if (hi - lo > 3) {
75                 for (i = hi-4; i >= lo; i--) {
76                         tmp = fmap[i];
77                         ec_tmp = eclass[tmp];
78                         for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
79                                 fmap[j-4] = fmap[j];
80                         fmap[j-4] = tmp;
81                 }
82         }
83
84         for (i = hi-1; i >= lo; i--) {
85                 tmp = fmap[i];
86                 ec_tmp = eclass[tmp];
87                 for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
88                         fmap[j-1] = fmap[j];
89                 fmap[j-1] = tmp;
90         }
91 }
92
93
94 /*---------------------------------------------*/
95 #define fpush(lz,hz) { \
96         stackLo[sp] = lz; \
97         stackHi[sp] = hz; \
98         sp++; \
99 }
100
101 #define fpop(lz,hz) { \
102         sp--; \
103         lz = stackLo[sp]; \
104         hz = stackHi[sp]; \
105 }
106
107 #define FALLBACK_QSORT_SMALL_THRESH 10
108 #define FALLBACK_QSORT_STACK_SIZE   100
109
110 static
111 void fallbackQSort3(uint32_t* fmap,
112                 uint32_t* eclass,
113                 int32_t   loSt,
114                 int32_t   hiSt)
115 {
116         int32_t unLo, unHi, ltLo, gtHi, n, m;
117         int32_t sp, lo, hi;
118         uint32_t med, r, r3;
119         int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
120         int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
121
122         r = 0;
123
124         sp = 0;
125         fpush(loSt, hiSt);
126
127         while (sp > 0) {
128                 AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
129
130                 fpop(lo, hi);
131                 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
132                         fallbackSimpleSort(fmap, eclass, lo, hi);
133                         continue;
134                 }
135
136                 /* Random partitioning.  Median of 3 sometimes fails to
137                  * avoid bad cases.  Median of 9 seems to help but
138                  * looks rather expensive.  This too seems to work but
139                  * is cheaper.  Guidance for the magic constants
140                  * 7621 and 32768 is taken from Sedgewick's algorithms
141                  * book, chapter 35.
142                  */
143                 r = ((r * 7621) + 1) % 32768;
144                 r3 = r % 3;
145                 if (r3 == 0)
146                         med = eclass[fmap[lo]];
147                 else if (r3 == 1)
148                         med = eclass[fmap[(lo+hi)>>1]];
149                 else
150                         med = eclass[fmap[hi]];
151
152                 unLo = ltLo = lo;
153                 unHi = gtHi = hi;
154
155                 while (1) {
156                         while (1) {
157                                 if (unLo > unHi) break;
158                                 n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
159                                 if (n == 0) {
160                                         mswap(fmap[unLo], fmap[ltLo]);
161                                         ltLo++;
162                                         unLo++;
163                                         continue;
164                                 };
165                                 if (n > 0) break;
166                                 unLo++;
167                         }
168                         while (1) {
169                                 if (unLo > unHi) break;
170                                 n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
171                                 if (n == 0) {
172                                         mswap(fmap[unHi], fmap[gtHi]);
173                                         gtHi--; unHi--;
174                                         continue;
175                                 };
176                                 if (n < 0) break;
177                                 unHi--;
178                         }
179                         if (unLo > unHi) break;
180                         mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
181                 }
182
183                 AssertD(unHi == unLo-1, "fallbackQSort3(2)");
184
185                 if (gtHi < ltLo) continue;
186
187                 n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
188                 m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
189
190                 n = lo + unLo - ltLo - 1;
191                 m = hi - (gtHi - unHi) + 1;
192
193                 if (n - lo > hi - m) {
194                         fpush(lo, n);
195                         fpush(m, hi);
196                 } else {
197                         fpush(m, hi);
198                         fpush(lo, n);
199                 }
200         }
201 }
202
203 #undef fpush
204 #undef fpop
205 #undef FALLBACK_QSORT_SMALL_THRESH
206 #undef FALLBACK_QSORT_STACK_SIZE
207
208
209 /*---------------------------------------------*/
210 /* Pre:
211  *      nblock > 0
212  *      eclass exists for [0 .. nblock-1]
213  *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
214  *      ptr exists for [0 .. nblock-1]
215  *
216  * Post:
217  *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
218  *      All other areas of eclass destroyed
219  *      fmap [0 .. nblock-1] holds sorted order
220  *      bhtab[0 .. 2+(nblock/32)] destroyed
221 */
222
223 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
224 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
225 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
226 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
227 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
228
229 static
230 void fallbackSort(uint32_t* fmap,
231                 uint32_t* eclass,
232                 uint32_t* bhtab,
233                 int32_t   nblock)
234 {
235         int32_t ftab[257];
236         int32_t ftabCopy[256];
237         int32_t H, i, j, k, l, r, cc, cc1;
238         int32_t nNotDone;
239         int32_t nBhtab;
240         uint8_t* eclass8 = (uint8_t*)eclass;
241
242         /*
243          * Initial 1-char radix sort to generate
244          * initial fmap and initial BH bits.
245          */
246         for (i = 0; i < 257;    i++) ftab[i] = 0;
247         for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
248         for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
249
250         j = ftab[0];  /* bbox: optimized */
251         for (i = 1; i < 257;    i++) {
252                 j += ftab[i];
253                 ftab[i] = j;
254         }
255
256         for (i = 0; i < nblock; i++) {
257                 j = eclass8[i];
258                 k = ftab[j] - 1;
259                 ftab[j] = k;
260                 fmap[k] = i;
261         }
262
263         nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
264         for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
265         for (i = 0; i < 256; i++) SET_BH(ftab[i]);
266
267         /*
268          * Inductively refine the buckets.  Kind-of an
269          * "exponential radix sort" (!), inspired by the
270          * Manber-Myers suffix array construction algorithm.
271          */
272
273         /*-- set sentinel bits for block-end detection --*/
274         for (i = 0; i < 32; i++) {
275                 SET_BH(nblock + 2*i);
276                 CLEAR_BH(nblock + 2*i + 1);
277         }
278
279         /*-- the log(N) loop --*/
280         H = 1;
281         while (1) {
282                 j = 0;
283                 for (i = 0; i < nblock; i++) {
284                         if (ISSET_BH(i))
285                                 j = i;
286                         k = fmap[i] - H;
287                         if (k < 0)
288                                 k += nblock;
289                         eclass[k] = j;
290                 }
291
292                 nNotDone = 0;
293                 r = -1;
294                 while (1) {
295
296                 /*-- find the next non-singleton bucket --*/
297                         k = r + 1;
298                         while (ISSET_BH(k) && UNALIGNED_BH(k))
299                                 k++;
300                         if (ISSET_BH(k)) {
301                                 while (WORD_BH(k) == 0xffffffff) k += 32;
302                                 while (ISSET_BH(k)) k++;
303                         }
304                         l = k - 1;
305                         if (l >= nblock)
306                                 break;
307                         while (!ISSET_BH(k) && UNALIGNED_BH(k))
308                                 k++;
309                         if (!ISSET_BH(k)) {
310                                 while (WORD_BH(k) == 0x00000000) k += 32;
311                                 while (!ISSET_BH(k)) k++;
312                         }
313                         r = k - 1;
314                         if (r >= nblock)
315                                 break;
316
317                         /*-- now [l, r] bracket current bucket --*/
318                         if (r > l) {
319                                 nNotDone += (r - l + 1);
320                                 fallbackQSort3(fmap, eclass, l, r);
321
322                                 /*-- scan bucket and generate header bits-- */
323                                 cc = -1;
324                                 for (i = l; i <= r; i++) {
325                                         cc1 = eclass[fmap[i]];
326                                         if (cc != cc1) {
327                                                 SET_BH(i);
328                                                 cc = cc1;
329                                         };
330                                 }
331                         }
332                 }
333
334                 H *= 2;
335                 if (H > nblock || nNotDone == 0)
336                         break;
337         }
338
339         /*
340          * Reconstruct the original block in
341          * eclass8 [0 .. nblock-1], since the
342          * previous phase destroyed it.
343          */
344         j = 0;
345         for (i = 0; i < nblock; i++) {
346                 while (ftabCopy[j] == 0)
347                         j++;
348                 ftabCopy[j]--;
349                 eclass8[fmap[i]] = (uint8_t)j;
350         }
351         AssertH(j < 256, 1005);
352 }
353
354 #undef       SET_BH
355 #undef     CLEAR_BH
356 #undef     ISSET_BH
357 #undef      WORD_BH
358 #undef UNALIGNED_BH
359
360
361 /*---------------------------------------------*/
362 /*--- The main, O(N^2 log(N)) sorting       ---*/
363 /*--- algorithm.  Faster for "normal"       ---*/
364 /*--- non-repetitive blocks.                ---*/
365 /*---------------------------------------------*/
366
367 /*---------------------------------------------*/
368 static
369 NOINLINE
370 int mainGtU(
371                 uint32_t  i1,
372                 uint32_t  i2,
373                 uint8_t*  block,
374                 uint16_t* quadrant,
375                 uint32_t  nblock,
376                 int32_t*  budget)
377 {
378         int32_t  k;
379         uint8_t  c1, c2;
380         uint16_t s1, s2;
381
382 /* Loop unrolling here is actually very useful
383  * (generated code is much simpler),
384  * code size increase is only 270 bytes (i386)
385  * but speeds up compression 10% overall
386  */
387
388 #if CONFIG_BZIP2_FEATURE_SPEED >= 1
389
390 #define TIMES_8(code) \
391         code; code; code; code; \
392         code; code; code; code;
393 #define TIMES_12(code) \
394         code; code; code; code; \
395         code; code; code; code; \
396         code; code; code; code;
397
398 #else
399
400 #define TIMES_8(code) \
401 { \
402         int nn = 8; \
403         do { \
404                 code; \
405         } while (--nn); \
406 }
407 #define TIMES_12(code) \
408 { \
409         int nn = 12; \
410         do { \
411                 code; \
412         } while (--nn); \
413 }
414
415 #endif
416
417         AssertD(i1 != i2, "mainGtU");
418         TIMES_12(
419                 c1 = block[i1]; c2 = block[i2];
420                 if (c1 != c2) return (c1 > c2);
421                 i1++; i2++;
422         )
423
424         k = nblock + 8;
425
426         do {
427                 TIMES_8(
428                         c1 = block[i1]; c2 = block[i2];
429                         if (c1 != c2) return (c1 > c2);
430                         s1 = quadrant[i1]; s2 = quadrant[i2];
431                         if (s1 != s2) return (s1 > s2);
432                         i1++; i2++;
433                 )
434
435                 if (i1 >= nblock) i1 -= nblock;
436                 if (i2 >= nblock) i2 -= nblock;
437
438                 (*budget)--;
439                 k -= 8;
440         } while (k >= 0);
441
442         return False;
443 }
444 #undef TIMES_8
445 #undef TIMES_12
446
447 /*---------------------------------------------*/
448 /*
449  * Knuth's increments seem to work better
450  * than Incerpi-Sedgewick here.  Possibly
451  * because the number of elems to sort is
452  * usually small, typically <= 20.
453  */
454 static
455 const int32_t incs[14] = {
456         1, 4, 13, 40, 121, 364, 1093, 3280,
457         9841, 29524, 88573, 265720,
458         797161, 2391484
459 };
460
461 static
462 void mainSimpleSort(uint32_t* ptr,
463                 uint8_t*  block,
464                 uint16_t* quadrant,
465                 int32_t   nblock,
466                 int32_t   lo,
467                 int32_t   hi,
468                 int32_t   d,
469                 int32_t*  budget)
470 {
471         int32_t i, j, h, bigN, hp;
472         uint32_t v;
473
474         bigN = hi - lo + 1;
475         if (bigN < 2) return;
476
477         hp = 0;
478         while (incs[hp] < bigN) hp++;
479         hp--;
480
481         for (; hp >= 0; hp--) {
482                 h = incs[hp];
483
484                 i = lo + h;
485                 while (1) {
486                         /*-- copy 1 --*/
487                         if (i > hi) break;
488                         v = ptr[i];
489                         j = i;
490                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
491                                 ptr[j] = ptr[j-h];
492                                 j = j - h;
493                                 if (j <= (lo + h - 1)) break;
494                         }
495                         ptr[j] = v;
496                         i++;
497
498 /* 1.5% overall speedup, +290 bytes */
499 #if CONFIG_BZIP2_FEATURE_SPEED >= 3
500                         /*-- copy 2 --*/
501                         if (i > hi) break;
502                         v = ptr[i];
503                         j = i;
504                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
505                                 ptr[j] = ptr[j-h];
506                                 j = j - h;
507                                 if (j <= (lo + h - 1)) break;
508                         }
509                         ptr[j] = v;
510                         i++;
511
512                         /*-- copy 3 --*/
513                         if (i > hi) break;
514                         v = ptr[i];
515                         j = i;
516                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
517                                 ptr[j] = ptr[j-h];
518                                 j = j - h;
519                                 if (j <= (lo + h - 1)) break;
520                         }
521                         ptr[j] = v;
522                         i++;
523 #endif
524                         if (*budget < 0) return;
525                 }
526         }
527 }
528
529
530 /*---------------------------------------------*/
531 /*
532  * The following is an implementation of
533  * an elegant 3-way quicksort for strings,
534  * described in a paper "Fast Algorithms for
535  * Sorting and Searching Strings", by Robert
536  * Sedgewick and Jon L. Bentley.
537  */
538
539 static
540 ALWAYS_INLINE
541 uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
542 {
543         uint8_t t;
544         if (a > b) {
545                 t = a;
546                 a = b;
547                 b = t;
548         };
549         /* here b >= a */
550         if (b > c) {
551                 b = c;
552                 if (a > b)
553                         b = a;
554         }
555         return b;
556 }
557
558 #define mpush(lz,hz,dz) \
559 { \
560         stackLo[sp] = lz; \
561         stackHi[sp] = hz; \
562         stackD [sp] = dz; \
563         sp++; \
564 }
565
566 #define mpop(lz,hz,dz) \
567 { \
568         sp--; \
569         lz = stackLo[sp]; \
570         hz = stackHi[sp]; \
571         dz = stackD [sp]; \
572 }
573
574 #define mnextsize(az) (nextHi[az] - nextLo[az])
575
576 #define mnextswap(az,bz) \
577 { \
578         int32_t tz; \
579         tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
580         tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
581         tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
582 }
583
584 #define MAIN_QSORT_SMALL_THRESH 20
585 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
586 #define MAIN_QSORT_STACK_SIZE   100
587
588 static
589 void mainQSort3(uint32_t* ptr,
590                 uint8_t*  block,
591                 uint16_t* quadrant,
592                 int32_t   nblock,
593                 int32_t   loSt,
594                 int32_t   hiSt,
595                 int32_t   dSt,
596                 int32_t*  budget)
597 {
598         int32_t unLo, unHi, ltLo, gtHi, n, m, med;
599         int32_t sp, lo, hi, d;
600
601         int32_t stackLo[MAIN_QSORT_STACK_SIZE];
602         int32_t stackHi[MAIN_QSORT_STACK_SIZE];
603         int32_t stackD [MAIN_QSORT_STACK_SIZE];
604
605         int32_t nextLo[3];
606         int32_t nextHi[3];
607         int32_t nextD [3];
608
609         sp = 0;
610         mpush(loSt, hiSt, dSt);
611
612         while (sp > 0) {
613                 AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
614
615                 mpop(lo, hi, d);
616                 if (hi - lo < MAIN_QSORT_SMALL_THRESH
617                  || d > MAIN_QSORT_DEPTH_THRESH
618                 ) {
619                         mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
620                         if (*budget < 0)
621                                 return;
622                         continue;
623                 }
624                 med = (int32_t) mmed3(block[ptr[lo          ] + d],
625                                       block[ptr[hi          ] + d],
626                                       block[ptr[(lo+hi) >> 1] + d]);
627
628                 unLo = ltLo = lo;
629                 unHi = gtHi = hi;
630
631                 while (1) {
632                         while (1) {
633                                 if (unLo > unHi)
634                                         break;
635                                 n = ((int32_t)block[ptr[unLo]+d]) - med;
636                                 if (n == 0) {
637                                         mswap(ptr[unLo], ptr[ltLo]);
638                                         ltLo++;
639                                         unLo++;
640                                         continue;
641                                 };
642                                 if (n >  0) break;
643                                 unLo++;
644                         }
645                         while (1) {
646                                 if (unLo > unHi)
647                                         break;
648                                 n = ((int32_t)block[ptr[unHi]+d]) - med;
649                                 if (n == 0) {
650                                         mswap(ptr[unHi], ptr[gtHi]);
651                                         gtHi--;
652                                         unHi--;
653                                         continue;
654                                 };
655                                 if (n <  0) break;
656                                 unHi--;
657                         }
658                         if (unLo > unHi)
659                                 break;
660                         mswap(ptr[unLo], ptr[unHi]);
661                         unLo++;
662                         unHi--;
663                 }
664
665                 AssertD(unHi == unLo-1, "mainQSort3(2)");
666
667                 if (gtHi < ltLo) {
668                         mpush(lo, hi, d + 1);
669                         continue;
670                 }
671
672                 n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
673                 m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
674
675                 n = lo + unLo - ltLo - 1;
676                 m = hi - (gtHi - unHi) + 1;
677
678                 nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
679                 nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
680                 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
681
682                 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
683                 if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
684                 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
685
686                 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
687                 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
688
689                 mpush(nextLo[0], nextHi[0], nextD[0]);
690                 mpush(nextLo[1], nextHi[1], nextD[1]);
691                 mpush(nextLo[2], nextHi[2], nextD[2]);
692         }
693 }
694
695 #undef mpush
696 #undef mpop
697 #undef mnextsize
698 #undef mnextswap
699 #undef MAIN_QSORT_SMALL_THRESH
700 #undef MAIN_QSORT_DEPTH_THRESH
701 #undef MAIN_QSORT_STACK_SIZE
702
703
704 /*---------------------------------------------*/
705 /* Pre:
706  *      nblock > N_OVERSHOOT
707  *      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
708  *      ((uint8_t*)block32) [0 .. nblock-1] holds block
709  *      ptr exists for [0 .. nblock-1]
710  *
711  * Post:
712  *      ((uint8_t*)block32) [0 .. nblock-1] holds block
713  *      All other areas of block32 destroyed
714  *      ftab[0 .. 65536] destroyed
715  *      ptr [0 .. nblock-1] holds sorted order
716  *      if (*budget < 0), sorting was abandoned
717  */
718
719 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
720 #define SETMASK (1 << 21)
721 #define CLEARMASK (~(SETMASK))
722
723 static NOINLINE
724 void mainSort(uint32_t*   ptr,
725                 uint8_t*  block,
726                 uint16_t* quadrant,
727                 uint32_t* ftab,
728                 int32_t   nblock,
729                 int32_t*  budget)
730 {
731         int32_t  i, j, k, ss, sb;
732         int32_t  runningOrder[256];
733         Bool     bigDone[256];
734         int32_t  copyStart[256];
735         int32_t  copyEnd  [256];
736         uint8_t  c1;
737         int32_t  numQSorted;
738         uint16_t s;
739
740         /*-- set up the 2-byte frequency table --*/
741         /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
742         memset(ftab, 0, 65537 * sizeof(ftab[0]));
743
744         j = block[0] << 8;
745         i = nblock - 1;
746 /* 3%, +300 bytes */
747 #if CONFIG_BZIP2_FEATURE_SPEED >= 2
748         for (; i >= 3; i -= 4) {
749                 quadrant[i] = 0;
750                 j = (j >> 8) | (((uint16_t)block[i]) << 8);
751                 ftab[j]++;
752                 quadrant[i-1] = 0;
753                 j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
754                 ftab[j]++;
755                 quadrant[i-2] = 0;
756                 j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
757                 ftab[j]++;
758                 quadrant[i-3] = 0;
759                 j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
760                 ftab[j]++;
761         }
762 #endif
763         for (; i >= 0; i--) {
764                 quadrant[i] = 0;
765                 j = (j >> 8) | (((uint16_t)block[i]) << 8);
766                 ftab[j]++;
767         }
768
769         /*-- (emphasises close relationship of block & quadrant) --*/
770         for (i = 0; i < BZ_N_OVERSHOOT; i++) {
771                 block   [nblock+i] = block[i];
772                 quadrant[nblock+i] = 0;
773         }
774
775         /*-- Complete the initial radix sort --*/
776         j = ftab[0]; /* bbox: optimized */
777         for (i = 1; i <= 65536; i++) {
778                 j += ftab[i];
779                 ftab[i] = j;
780         }
781
782         s = block[0] << 8;
783         i = nblock - 1;
784 #if CONFIG_BZIP2_FEATURE_SPEED >= 2
785         for (; i >= 3; i -= 4) {
786                 s = (s >> 8) | (block[i] << 8);
787                 j = ftab[s] - 1;
788                 ftab[s] = j;
789                 ptr[j] = i;
790                 s = (s >> 8) | (block[i-1] << 8);
791                 j = ftab[s] - 1;
792                 ftab[s] = j;
793                 ptr[j] = i-1;
794                 s = (s >> 8) | (block[i-2] << 8);
795                 j = ftab[s] - 1;
796                 ftab[s] = j;
797                 ptr[j] = i-2;
798                 s = (s >> 8) | (block[i-3] << 8);
799                 j = ftab[s] - 1;
800                 ftab[s] = j;
801                 ptr[j] = i-3;
802         }
803 #endif
804         for (; i >= 0; i--) {
805                 s = (s >> 8) | (block[i] << 8);
806                 j = ftab[s] - 1;
807                 ftab[s] = j;
808                 ptr[j] = i;
809         }
810
811         /*
812          * Now ftab contains the first loc of every small bucket.
813          * Calculate the running order, from smallest to largest
814          * big bucket.
815          */
816         for (i = 0; i <= 255; i++) {
817                 bigDone     [i] = False;
818                 runningOrder[i] = i;
819         }
820
821         {
822                 int32_t vv;
823                 /* bbox: was: int32_t h = 1; */
824                 /* do h = 3 * h + 1; while (h <= 256); */
825                 uint32_t h = 364;
826
827                 do {
828                         /*h = h / 3;*/
829                         h = (h * 171) >> 9; /* bbox: fast h/3 */
830                         for (i = h; i <= 255; i++) {
831                                 vv = runningOrder[i];
832                                 j = i;
833                                 while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
834                                         runningOrder[j] = runningOrder[j-h];
835                                         j = j - h;
836                                         if (j <= (h - 1))
837                                                 goto zero;
838                                 }
839  zero:
840                                 runningOrder[j] = vv;
841                         }
842                 } while (h != 1);
843         }
844
845         /*
846          * The main sorting loop.
847          */
848
849         numQSorted = 0;
850
851         for (i = 0; i <= 255; i++) {
852
853                 /*
854                  * Process big buckets, starting with the least full.
855                  * Basically this is a 3-step process in which we call
856                  * mainQSort3 to sort the small buckets [ss, j], but
857                  * also make a big effort to avoid the calls if we can.
858                  */
859                 ss = runningOrder[i];
860
861                 /*
862                  * Step 1:
863                  * Complete the big bucket [ss] by quicksorting
864                  * any unsorted small buckets [ss, j], for j != ss.
865                  * Hopefully previous pointer-scanning phases have already
866                  * completed many of the small buckets [ss, j], so
867                  * we don't have to sort them at all.
868                  */
869                 for (j = 0; j <= 255; j++) {
870                         if (j != ss) {
871                                 sb = (ss << 8) + j;
872                                 if (!(ftab[sb] & SETMASK)) {
873                                         int32_t lo =  ftab[sb]   & CLEARMASK;
874                                         int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
875                                         if (hi > lo) {
876                                                 mainQSort3(
877                                                         ptr, block, quadrant, nblock,
878                                                         lo, hi, BZ_N_RADIX, budget
879                                                 );
880                                                 if (*budget < 0) return;
881                                                 numQSorted += (hi - lo + 1);
882                                         }
883                                 }
884                                 ftab[sb] |= SETMASK;
885                         }
886                 }
887
888                 AssertH(!bigDone[ss], 1006);
889
890                 /*
891                  * Step 2:
892                  * Now scan this big bucket [ss] so as to synthesise the
893                  * sorted order for small buckets [t, ss] for all t,
894                  * including, magically, the bucket [ss,ss] too.
895                  * This will avoid doing Real Work in subsequent Step 1's.
896                  */
897                 {
898                         for (j = 0; j <= 255; j++) {
899                                 copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
900                                 copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
901                         }
902                         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
903                                 k = ptr[j] - 1;
904                                 if (k < 0)
905                                         k += nblock;
906                                 c1 = block[k];
907                                 if (!bigDone[c1])
908                                         ptr[copyStart[c1]++] = k;
909                         }
910                         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
911                                 k = ptr[j]-1;
912                                 if (k < 0)
913                                         k += nblock;
914                                 c1 = block[k];
915                                 if (!bigDone[c1])
916                                         ptr[copyEnd[c1]--] = k;
917                         }
918                 }
919
920                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
921                  * Necessity for this case is demonstrated by compressing
922                  * a sequence of approximately 48.5 million of character
923                  * 251; 1.0.0/1.0.1 will then die here. */
924                 AssertH((copyStart[ss]-1 == copyEnd[ss]) \
925                      || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
926
927                 for (j = 0; j <= 255; j++)
928                         ftab[(j << 8) + ss] |= SETMASK;
929
930                 /*
931                  * Step 3:
932                  * The [ss] big bucket is now done.  Record this fact,
933                  * and update the quadrant descriptors.  Remember to
934                  * update quadrants in the overshoot area too, if
935                  * necessary.  The "if (i < 255)" test merely skips
936                  * this updating for the last bucket processed, since
937                  * updating for the last bucket is pointless.
938                  *
939                  * The quadrant array provides a way to incrementally
940                  * cache sort orderings, as they appear, so as to
941                  * make subsequent comparisons in fullGtU() complete
942                  * faster.  For repetitive blocks this makes a big
943                  * difference (but not big enough to be able to avoid
944                  * the fallback sorting mechanism, exponential radix sort).
945                  *
946                  * The precise meaning is: at all times:
947                  *
948                  *      for 0 <= i < nblock and 0 <= j <= nblock
949                  *
950                  *      if block[i] != block[j],
951                  *
952                  *              then the relative values of quadrant[i] and
953                  *                        quadrant[j] are meaningless.
954                  *
955                  *              else {
956                  *                      if quadrant[i] < quadrant[j]
957                  *                              then the string starting at i lexicographically
958                  *                              precedes the string starting at j
959                  *
960                  *                      else if quadrant[i] > quadrant[j]
961                  *                              then the string starting at j lexicographically
962                  *                              precedes the string starting at i
963                  *
964                  *                      else
965                  *                              the relative ordering of the strings starting
966                  *                              at i and j has not yet been determined.
967                  *              }
968                  */
969                 bigDone[ss] = True;
970
971                 if (i < 255) {
972                         int32_t bbStart = ftab[ss << 8] & CLEARMASK;
973                         int32_t bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
974                         int32_t shifts  = 0;
975
976                         while ((bbSize >> shifts) > 65534) shifts++;
977
978                         for (j = bbSize-1; j >= 0; j--) {
979                                 int32_t a2update   = ptr[bbStart + j];
980                                 uint16_t qVal      = (uint16_t)(j >> shifts);
981                                 quadrant[a2update] = qVal;
982                                 if (a2update < BZ_N_OVERSHOOT)
983                                         quadrant[a2update + nblock] = qVal;
984                         }
985                         AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
986                 }
987         }
988 }
989
990 #undef BIGFREQ
991 #undef SETMASK
992 #undef CLEARMASK
993
994
995 /*---------------------------------------------*/
996 /* Pre:
997  *      nblock > 0
998  *      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
999  *        ((uint8_t*)arr2)[0 .. nblock-1] holds block
1000  *      arr1 exists for [0 .. nblock-1]
1001  *
1002  * Post:
1003  *      ((uint8_t*)arr2) [0 .. nblock-1] holds block
1004  *      All other areas of block destroyed
1005  *      ftab[0 .. 65536] destroyed
1006  *      arr1[0 .. nblock-1] holds sorted order
1007  */
1008 static NOINLINE
1009 void BZ2_blockSort(EState* s)
1010 {
1011         /* In original bzip2 1.0.4, it's a parameter, but 30
1012          * (which was the default) should work ok. */
1013         enum { wfact = 30 };
1014
1015         uint32_t* ptr    = s->ptr;
1016         uint8_t*  block  = s->block;
1017         uint32_t* ftab   = s->ftab;
1018         int32_t   nblock = s->nblock;
1019         uint16_t* quadrant;
1020         int32_t   budget;
1021         int32_t   i;
1022
1023         if (nblock < 10000) {
1024                 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1025         } else {
1026                 /* Calculate the location for quadrant, remembering to get
1027                  * the alignment right.  Assumes that &(block[0]) is at least
1028                  * 2-byte aligned -- this should be ok since block is really
1029                  * the first section of arr2.
1030                  */
1031                 i = nblock + BZ_N_OVERSHOOT;
1032                 if (i & 1) i++;
1033                 quadrant = (uint16_t*)(&(block[i]));
1034
1035                 /* (wfact-1) / 3 puts the default-factor-30
1036                  * transition point at very roughly the same place as
1037                  * with v0.1 and v0.9.0.
1038                  * Not that it particularly matters any more, since the
1039                  * resulting compressed stream is now the same regardless
1040                  * of whether or not we use the main sort or fallback sort.
1041                  */
1042                 budget = nblock * ((wfact-1) / 3);
1043
1044                 mainSort(ptr, block, quadrant, ftab, nblock, &budget);
1045                 if (budget < 0) {
1046                         fallbackSort(s->arr1, s->arr2, ftab, nblock);
1047                 }
1048         }
1049
1050         s->origPtr = -1;
1051         for (i = 0; i < s->nblock; i++)
1052                 if (ptr[i] == 0) {
1053                         s->origPtr = i;
1054                         break;
1055                 };
1056
1057         AssertH(s->origPtr != -1, 1003);
1058 }
1059
1060
1061 /*-------------------------------------------------------------*/
1062 /*--- end                                       blocksort.c ---*/
1063 /*-------------------------------------------------------------*/