Blog coding and discussion of coding about JavaScript, PHP, CGI, general web building etc.

Friday, December 11, 2015

Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C

Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


What is the best algorithm to achieve the following:

0010 0000 => 0000 0100

The conversion is from MSB->LSB to LSB->MSB. All bits must be reversed; that is, this is not endianness-swapping.

Answer by Kip for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


I'm not sure if it the most efficient, but this should work:

//returns n rotated right by b bits  int rotRight(int n, int b)  {    if(b >= 0)      b = b%(8*sizeof(int));    else      b = 8*sizeof(int) - (-b)%(8*sizeof(int));    return (n >>> b) | (n << (8*sizeof(int) - b));  }    //returns n rotated left by b bits  int rotLeft(int n, int b)  {    if(b >= 0)      b = b%(8*sizeof(int));    else      b = 8*sizeof(int) - (-b)%(8*sizeof(int));    return (n << b) | (n >>> (8*sizeof(int) - b));  }  

Update: added support for negative "b" value.

Note: this answer was based on an earlier version of the question, which incorrectly asked for bit rotation even though that was not what was actually desired.

Answer by Matt J for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


NOTE: All algorithms below are in C, but should be portable to your language of choice (just don't look at me when they're not as fast :)

Options

Low Memory (32-bit int, 32-bit machine)(from here):

unsigned int  reverse(register unsigned int x)  {      x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));      x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));      x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));      x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));      return((x >> 16) | (x << 16));    }  

From the famous Bit Twiddling Hacks page:

Fastest (lookup table):

static const unsigned char BitReverseTable256[] =   {    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,     0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,     0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,     0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,     0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,     0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,     0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,     0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,     0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,     0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF  };    unsigned int v; // reverse 32-bit value, 8 bits at time  unsigned int c; // c will get v reversed    // Option 1:  c = (BitReverseTable256[v & 0xff] << 24) |       (BitReverseTable256[(v >> 8) & 0xff] << 16) |       (BitReverseTable256[(v >> 16) & 0xff] << 8) |      (BitReverseTable256[(v >> 24) & 0xff]);    // Option 2:  unsigned char * p = (unsigned char *) &v;  unsigned char * q = (unsigned char *) &c;  q[3] = BitReverseTable256[p[0]];   q[2] = BitReverseTable256[p[1]];   q[1] = BitReverseTable256[p[2]];   q[0] = BitReverseTable256[p[3]];  

You can extend this idea to 64-bit ints, or trade off memory for speed (assuming your L1 Data Cache is large enough), and reverse 16-bits at a time with a 64K-entry lookup table.


Others

Simple

unsigned int v;     // input bits to be reversed  unsigned int r = v; // r will be reversed bits of v; first get LSB of v  int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end    for (v >>= 1; v; v >>= 1)  {       r <<= 1;    r |= v & 1;    s--;  }  r <<= s; // shift when v's highest bits are zero  

Faster (32-bit processor)

unsigned char b = x;  b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;   

Faster (64-bit processor)

unsigned char b; // reverse this (8-bit) byte  b = (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;  

If you want to do this on a 32-bit int, just reverse the bits in each bytes, and reverse the order of the bytes. That is:

unsigned int toReverse;  unsigned int reversed;  unsigned char inByte0 = (toReverse & 0xFF);  unsigned char inByte1 = (toReverse & 0xFF00) >> 8;  unsigned char inByte2 = (toReverse & 0xFF0000) >> 16;  unsigned char inByte3 = (toReverse & 0xFF000000) >> 24;  reversed = (reverseBits(inByte0) << 24) | (reverseBits(inByte1) << 16) | (reverseBits(inByte2) << 8) | (reverseBits(inByte3);  

Results

I benchmarked the two most promising solutions, the lookup table, and bitwise-AND (the first one). The test machine is a laptop w/ 4GB of DDR2-800 and a Core 2 Duo T7500 @ 2.4GHz, 4MB L2 Cache; YMMV. I used gcc 4.3.2 on 64-bit Linux. OpenMP (and the GCC bindings) were used for high-resolution timers.

reverse.c

#include   #include   #include     unsigned int  reverse(register unsigned int x)  {      x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));      x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));      x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));      x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));      return((x >> 16) | (x << 16));    }    int main()  {      unsigned int *ints = malloc(100000000*sizeof(unsigned int));      unsigned int *ints2 = malloc(100000000*sizeof(unsigned int));      for(unsigned int i = 0; i < 100000000; i++)        ints[i] = rand();        unsigned int *inptr = ints;      unsigned int *outptr = ints2;      unsigned int *endptr = ints + 100000000;      // Starting the time measurement      double start = omp_get_wtime();      // Computations to be measured      while(inptr != endptr)      {        (*outptr) = reverse(*inptr);        inptr++;        outptr++;      }      // Measuring the elapsed time      double end = omp_get_wtime();      // Time calculation (in seconds)      printf("Time: %f seconds\n", end-start);        free(ints);      free(ints2);        return 0;  }  

reverse_lookup.c

#include   #include   #include     static const unsigned char BitReverseTable256[] =   {    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,     0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,     0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,     0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,     0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,     0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,     0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,     0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,     0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,     0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF  };    int main()  {      unsigned int *ints = malloc(100000000*sizeof(unsigned int));      unsigned int *ints2 = malloc(100000000*sizeof(unsigned int));      for(unsigned int i = 0; i < 100000000; i++)        ints[i] = rand();        unsigned int *inptr = ints;      unsigned int *outptr = ints2;      unsigned int *endptr = ints + 100000000;      // Starting the time measurement      double start = omp_get_wtime();      // Computations to be measured      while(inptr != endptr)      {      unsigned int in = *inptr;          // Option 1:      //*outptr = (BitReverseTable256[in & 0xff] << 24) |       //    (BitReverseTable256[(in >> 8) & 0xff] << 16) |       //    (BitReverseTable256[(in >> 16) & 0xff] << 8) |      //    (BitReverseTable256[(in >> 24) & 0xff]);        // Option 2:      unsigned char * p = (unsigned char *) &(*inptr);      unsigned char * q = (unsigned char *) &(*outptr);      q[3] = BitReverseTable256[p[0]];       q[2] = BitReverseTable256[p[1]];       q[1] = BitReverseTable256[p[2]];       q[0] = BitReverseTable256[p[3]];          inptr++;        outptr++;      }      // Measuring the elapsed time      double end = omp_get_wtime();      // Time calculation (in seconds)      printf("Time: %f seconds\n", end-start);        free(ints);      free(ints2);        return 0;  }  

I tried both approaches at several different optimizations, ran 3 trials at each level, and each trial reversed 100 million random unsigned ints. For the lookup table option, I tried both schemes (options 1 and 2) given on the bitwise hacks page. Results are shown below.

Bitwise AND

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse reverse.c  mrj10@mjlap:~/code$ ./reverse  Time: 2.000593 seconds  mrj10@mjlap:~/code$ ./reverse  Time: 1.938893 seconds  mrj10@mjlap:~/code$ ./reverse  Time: 1.936365 seconds  mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse reverse.c  mrj10@mjlap:~/code$ ./reverse  Time: 0.942709 seconds  mrj10@mjlap:~/code$ ./reverse  Time: 0.991104 seconds  mrj10@mjlap:~/code$ ./reverse  Time: 0.947203 seconds  mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse reverse.c  mrj10@mjlap:~/code$ ./reverse  Time: 0.922639 seconds  mrj10@mjlap:~/code$ ./reverse  Time: 0.892372 seconds  mrj10@mjlap:~/code$ ./reverse  Time: 0.891688 seconds  

Lookup Table (option 1)

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.201127 seconds                mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.196129 seconds                mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.235972 seconds                mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 0.633042 seconds                mrj10@mjlap:~/code$ ./reverse_lookup  Time: 0.655880 seconds                mrj10@mjlap:~/code$ ./reverse_lookup  Time: 0.633390 seconds                mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 0.652322 seconds                mrj10@mjlap:~/code$ ./reverse_lookup  Time: 0.631739 seconds                mrj10@mjlap:~/code$ ./reverse_lookup  Time: 0.652431 seconds    

Lookup Table (option 2)

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.671537 seconds  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.688173 seconds  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.664662 seconds  mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.049851 seconds  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.048403 seconds  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.085086 seconds  mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.082223 seconds  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.053431 seconds  mrj10@mjlap:~/code$ ./reverse_lookup  Time: 1.081224 seconds  

Conclusion

Use the lookup table, with option 1 (byte addressing is unsurprisingly slow) if you're concerned about performance. If you need to squeeze every last byte of memory out of your system (and you might, if you care about the performance of bit reversal), the optimized versions of the bitwise-AND approach aren't too shabby either.

Caveat

Yes, I know the benchmark code is a complete hack. Suggestions on how to improve it are more than welcome. Things I know about:

  • I don't have access to ICC. This may be faster (please respond in a comment if you can test this out).
  • A 64K lookup table may do well on some modern microarchitectures with large L1D.
  • -mtune=native didn't work for -O2/-O3 (ld blew up with some crazy symbol redefinition error), so I don't believe the generated code is tuned for my microarchitecture.
  • There may be a way to do this slightly faster with SSE. I have no idea how, but with fast replication, packed bitwise AND, and swizzling instructions, there's got to be something there.
  • I know only enough x86 assembly to be dangerous; here's the code GCC generated on -O3 for option 1, so somebody more knowledgable than myself can check it out:

32-bit

.L3:  movl    (%r12,%rsi), %ecx  movzbl  %cl, %eax  movzbl  BitReverseTable256(%rax), %edx  movl    %ecx, %eax  shrl    $24, %eax  mov     %eax, %eax  movzbl  BitReverseTable256(%rax), %eax  sall    $24, %edx  orl     %eax, %edx  movzbl  %ch, %eax  shrl    $16, %ecx  movzbl  BitReverseTable256(%rax), %eax  movzbl  %cl, %ecx  sall    $16, %eax  orl     %eax, %edx  movzbl  BitReverseTable256(%rcx), %eax  sall    $8, %eax  orl     %eax, %edx  movl    %edx, (%r13,%rsi)  addq    $4, %rsi  cmpq    $400000000, %rsi  jne     .L3  

EDIT: I also tried using uint64_t's on my machine to see if there was any performance boost. Performance was about 10% faster than 32-bit, and was nearly identical whether you were just using 64-bit types to reverse bits on two 32-bit ints at a time, or whether you were actually reversing bits in half as many 64-bit values. The assembly code is shown below (for the former case, reversing bits for 2 32-bit ints at a time):

.L3:  movq    (%r12,%rsi), %rdx  movq    %rdx, %rax  shrq    $24, %rax  andl    $255, %eax  movzbl  BitReverseTable256(%rax), %ecx  movzbq  %dl,%rax  movzbl  BitReverseTable256(%rax), %eax  salq    $24, %rax  orq     %rax, %rcx  movq    %rdx, %rax  shrq    $56, %rax  movzbl  BitReverseTable256(%rax), %eax  salq    $32, %rax  orq     %rax, %rcx  movzbl  %dh, %eax  shrq    $16, %rdx  movzbl  BitReverseTable256(%rax), %eax  salq    $16, %rax  orq     %rax, %rcx  movzbq  %dl,%rax  shrq    $16, %rdx  movzbl  BitReverseTable256(%rax), %eax  salq    $8, %rax  orq     %rax, %rcx  movzbq  %dl,%rax  shrq    $8, %rdx  movzbl  BitReverseTable256(%rax), %eax  salq    $56, %rax  orq     %rax, %rcx  movzbq  %dl,%rax  shrq    $8, %rdx  movzbl  BitReverseTable256(%rax), %eax  andl    $255, %edx  salq    $48, %rax  orq     %rax, %rcx  movzbl  BitReverseTable256(%rdx), %eax  salq    $40, %rax  orq     %rax, %rcx  movq    %rcx, (%r13,%rsi)  addq    $8, %rsi  cmpq    $400000000, %rsi  jne     .L3  

Answer by Frederick The Fool for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


Presuming that you have an array of bits, how about this: 1. Starting from MSB, push bits into a stack one by one. 2. Pop bits from this stack into another array (or the same array if you want to save space), placing the first popped bit into MSB and going on to less significant bits from there.

Stack stack = new Stack();  Bit[] bits = new Bit[] { 0, 0, 1, 0, 0, 0, 0, 0 };    for (int i = 0; i < bits.Length; i++)   {      stack.push(bits[i]);  }    for (int i = 0; i < bits.Length; i++)  {      bits[i] = stack.pop();  }  

Answer by vjangus for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


Generic

C code. Using 1 byte input data num as example.

    unsigned char num = 0xaa;   // 1010 1010 (aa) -> 0101 0101 (55)      int s = sizeof(num) * 8;    // get number of bits      int i, x, y, p;      int var = 0;                // make var data type to be equal or larger than num        for (i = 0; i < (s / 2); i++) {          // extract bit on the left, from MSB          p = s - i - 1;          x = num & (1 << p);          x = x >> p;          printf("x: %d\n", x);            // extract bit on the right, from LSB          y = num & (1 << i);          y = y >> i;          printf("y: %d\n", y);            var = var | (x << i);       // apply x          var = var | (y << p);       // apply y      }        printf("new: 0x%x\n", new);  

Answer by Anders Hansson for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


Of course the obvious source of bit-twiddling hacks is here: http://graphics.stanford.edu/~seander/bithacks.html#BitReverseObvious

Answer by Aung for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


Implementation with low memory and fastest.

private Byte  BitReverse(Byte bData)      {          Byte[] lookup = { 0, 8,  4, 12,                             2, 10, 6, 14 ,                             1, 9,  5, 13,                            3, 11, 7, 15 };          Byte ret_val = (Byte)(((lookup[(bData & 0x0F)]) << 4) + lookup[((bData & 0xF0) >> 4)]);          return ret_val;      }  

Answer by wildplasser for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


Well, this is basically the same as the first "reverse()" but it is 64 bit and only needs one immediate mask to be loaded from the instruction stream. GCC creates code without jumps, so this should be pretty fast.

#include     static unsigned long long swap64(unsigned long long val)  {  #define ZZZZ(x,s,m) (((x) >>(s)) & (m)) | (((x) & (m))<<(s));  /* val = (((val) >>16) & 0xFFFF0000FFFF) | (((val) & 0xFFFF0000FFFF)<<16); */    val = ZZZZ(val,32,  0x00000000FFFFFFFFull );  val = ZZZZ(val,16,  0x0000FFFF0000FFFFull );  val = ZZZZ(val,8,   0x00FF00FF00FF00FFull );  val = ZZZZ(val,4,   0x0F0F0F0F0F0F0F0Full );  val = ZZZZ(val,2,   0x3333333333333333ull );  val = ZZZZ(val,1,   0x5555555555555555ull );    return val;  #undef ZZZZ  }    int main(void)  {  unsigned long long val, aaaa[16] =   { 0xfedcba9876543210,0xedcba9876543210f,0xdcba9876543210fe,0xcba9876543210fed   , 0xba9876543210fedc,0xa9876543210fedcb,0x9876543210fedcba,0x876543210fedcba9   , 0x76543210fedcba98,0x6543210fedcba987,0x543210fedcba9876,0x43210fedcba98765   , 0x3210fedcba987654,0x210fedcba9876543,0x10fedcba98765432,0x0fedcba987654321   };  unsigned iii;    for (iii=0; iii < 16; iii++) {      val = swap64 (aaaa[iii]);      printf("A[]=%016llX Sw=%016llx\n", aaaa[iii], val);      }  return 0;  }  

Answer by Dennis Mathews for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


This is another solution for folks who love recursion.

The idea is simple. Divide up input by half and swap the two halves, continue till it it reaches single bit.

Illustrated in the example below.    Ex : If Input is 00101010   ==> Expected output is 01010100    1.  Divide the input into 2 halves       0010 --- 1010    2. Swap the 2 Halves      1010     0010    3. Repeat the same for each half.      10 -- 10 ---  00 -- 10      10    10      10    00        1-0 -- 1-0 --- 1-0 -- 0-0      0 1    0 1     0 1    0 0    Done! Output is 01010100  

Here is a recursive function to solve it. (Note I have used unsigned ints, so it can work for inputs upto sizeof(unsigned int)*8 bits.

The recursive function takes 2 parameters - The value whose bits need to be reversed and the number of bits in the value.

int reverse_bits_recursive(unsigned int num, unsigned int numBits)  {      unsigned int reversedNum;;      unsigned int mask = 0;        mask = (0x1 << (numBits/2)) - 1;        if (numBits == 1) return num;      reversedNum = reverse_bits_recursive(num >> numBits/2, numBits/2) |                     reverse_bits_recursive((num & mask), numBits/2) << numBits/2;      return reversedNum;  }    int main()  {      unsigned int reversedNum;      unsigned int num;        num = 0x55;      reversedNum = reverse_bits_recursive(num, 8);      printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);        num = 0xabcd;      reversedNum = reverse_bits_recursive(num, 16);      printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);        num = 0x123456;      reversedNum = reverse_bits_recursive(num, 24);      printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);        num = 0x11223344;      reversedNum = reverse_bits_recursive(num,32);      printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);  }  

This is the output:

Bit Reversal Input = 0x55 Output = 0xaa  Bit Reversal Input = 0xabcd Output = 0xb3d5  Bit Reversal Input = 0x123456 Output = 0x651690  Bit Reversal Input = 0x11223344 Output = 0x22cc4488  

Answer by Cem for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


You might want to use the standard template library. It might be slower than the above mentioned code. However, it seems to me clearer and easier to understand.

 #include   #include       template   const std::bitset reverse(const std::bitset& ordered)   {        std::bitset reversed;        for(size_t i = 0, j = N - 1; i < N; ++i, --j)             reversed[j] = ordered[i];        return reversed;   };       // test the function   int main()   {        unsigned long num;         const size_t N = sizeof(num)*8;          std::cin >> num;        std::cout << std::showbase << std::hex;        std::cout << "ordered  = " << num << std::endl;        std::cout << "reversed = " << reverse(num).to_ulong()  << std::endl;        std::cout << "double_reversed = " << reverse(reverse(num)).to_ulong() << std::endl;     }  

Answer by BlueAutumn for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


How about the following:

    uint reverseMSBToLSB32ui(uint input)      {          uint output = 0x00000000;          uint toANDVar = 0;          int places = 0;            for (int i = 1; i < 32; i++)          {              places = (32 - i);              toANDVar = (uint)(1 << places);              output |= (uint)(input & (toANDVar)) >> places;            }              return output;      }  

Small and easy (though, 32 bit only).

Answer by rahul for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


int main() {      int n;      scanf("%d", &n);      while (n) {          if (n & 1)              printf("1");          else              printf("0");          n >>= 1;      }      printf("\n");  }    Output:  25  10011  

Answer by Coco for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


I know its not C but asm:

var1 dw 0f0f0  clc  ...
push ax push cx mov cx 16 loop1: shl var1 shr ax loop loop1 pop ax pop cx

This works with the carry bit, so you may save flags too

Answer by Peter Sikora for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


Bit reversal in pseudo code

source -> byte to be reversed b00101100 destination -> reversed, also needs to be of unsigned type so sign bit is not propogated down

copy into temp so original is unaffected, also needs to be of unsigned type so that sign bit is not shifted in automaticaly

bytecopy = b0010110  

LOOP8: //do this 8 times test if bytecopy is < 0 (negative)

    set bit8 (msb) of reversed = reversed | b10000000     else do not set bit8    shift bytecopy left 1 place  bytecopy = bytecopy << 1 = b0101100 result    shift result right 1 place  reversed = reversed >> 1 = b00000000  8 times no then up^ LOOP8  8 times yes then done.  

Answer by Khulja Sim Sim for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


The Question asked is for reversing a byte (8 Bits of data)

typedef unsigned char byte;    byte reverseByte(byte a)  {      int i;      byte b = 0;        for ( i = 0 ; i < 8 ; i ++)      {          b <<= 1;          b |=  ( (a & (1 << i)) >> i);                }      return b;   }  

Answer by GuyGreer for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


Well this certainly won't be an answer like Matt J's but hopefully it will still be useful.

    unsigned int reverse(unsigned int t)      {          size_t n = t;//store in 64 bit number for call to BSWAP          __asm__("BSWAP %0" : "=r"(n) : "0"(n));          n >>= ((sizeof(size_t) - sizeof(unsigned int)) * 8);          n = ((n & 0xaaaaaaaa) >> 1) | ((n & 0x55555555) << 1);          n = ((n & 0xcccccccc) >> 2) | ((n & 0x33333333) << 2);          n = ((n & 0xf0f0f0f0) >> 4) | ((n & 0x0f0f0f0f) << 4);          return n;      }  

This is exactly the same idea as Matt's best algorithm except that there's this little instruction called BSWAP which swaps the bytes (not the bits) of a 64-bit number. So b7,b6,b5,b4,b3,b2,b1,b0 becomes b0,b1,b2,b3,b4,b5,b6,b7. Since we are working with a 32-bit number we need to shift our byte-swapped number down 32 bits. This just leaves us with the task of swapping the 8 bits of each byte which is done and voila! we're done.

Timing: on my machine, Matt's algorithm ran in ~0.52 seconds per trial. Mine ran in about 0.42 seconds per trial. 20% faster is not bad I think.

If you're worried about the availability of the instruction BSWAP Wikipedia lists the instruction BSWAP as being added with 80846 which came out in 1989. It should be noted that Wikipedia also states that this instruction only works on 32 bit registers which is clearly not the case on my machine, it very much works only on 64-bit registers.

This method will work equally well for any integral datatype so the method can be generalized trivially by passing the number of bytes desired:

    size_t reverse(size_t n, unsigned int bytes)      {          __asm__("BSWAP %0" : "=r"(n) : "0"(n));          n >>= ((sizeof(size_t) - bytes) * 8);          n = ((n & 0xaaaaaaaa) >> 1) | ((n & 0x55555555) << 1);          n = ((n & 0xcccccccc) >> 2) | ((n & 0x33333333) << 2);          n = ((n & 0xf0f0f0f0) >> 4) | ((n & 0x0f0f0f0f) << 4);          return n;      }  

which can then be called like:

    n = reverse(n, sizeof(char));//only reverse 8 bits      n = reverse(n, sizeof(short));//reverse 16 bits      n = reverse(n, sizeof(int));//reverse 32 bits      n = reverse(n, sizeof(size_t));//reverse 64 bits  

The compiler should be able to optimize the extra parameter away (assuming the compiler inlines the function) and for the sizeof(size_t) case the right-shift would be removed completely. Note that GCC at least is not able to remove the BSWAP and right-shift if passed sizeof(char).

Answer by user3615967 for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


unsigned char ReverseBits(unsigned char data)  {      unsigned char k = 0, rev = 0;        unsigned char n = data;        while(n)        {          k = n & (~(n - 1));          n &= (n - 1);          rev |= (128 / k);      }      return rev;  }  

Answer by Anders Cedronius for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


This thread caught my attention since it deals with a simple problem that requires a lot of work (CPU cycles) even for a modern CPU. And one day I also stood there with the same #%"#" problem. I had to flip millions of bytes. However I know all my target systems are modern Intel based so lets start optimizing to the extreme!!!

So I used Matt J's lookup code as the base. the system I'm benchmarking on is a i7 haswell 4700eq.

Matt J's lookup bitflipping 400 000 000 bytes: Around 0.272 seconds.

I then went ahead and tried to see if Intels ISPC compiler could vectorise the arithmetics in the reverse.c.

I'm not going to bore you with my findings here since I tried a lot to help the compiler find stuff, anyhow I ended up with performance of around 0.15 seconds to bitflipp 400 000 000 bytes. it's a great reduction but for my application that's still way way to slow..

So people let me present the fastest intel based bitflipper in the world. Clocked at:

Time to bitflip 400000000 bytes: 0.050082 seconds !!!!!

// Bitflip using AVX2 - The fastest Intel based bitflip in the world!!  // Made by Anders Cedronius 2014 (anders.cedronius (you know what) gmail.com)    #include   #include   #include   #include     using namespace std;    #define DISPLAY_HEIGHT  4  #define DISPLAY_WIDTH   32  #define NUM_DATA_BYTES  400000000    // Constants (first we got the mask, then the high order nibble look up table and last we got the low order nibble lookup table)  __attribute__ ((aligned(32))) static unsigned char k1[32*3]={          0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,          0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,          0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0  };    // The data to be bitflipped (+32 to avoid the quantization out of memory problem)  __attribute__ ((aligned(32))) static unsigned char data[NUM_DATA_BYTES+32]={};    extern "C" {  void bitflipbyte(unsigned char[],unsigned int,unsigned char[]);  }    int main()  {        for(unsigned int i = 0; i < NUM_DATA_BYTES; i++)      {          data[i] = rand();      }        printf ("\r\nData in(start):\r\n");      for (unsigned int j = 0; j < 4; j++)      {          for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)          {              printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);          }          printf ("\r\n");      }        printf ("\r\nNumber of 32-byte chunks to convert: %d\r\n",(unsigned int)ceil(NUM_DATA_BYTES/32.0));        double start_time = omp_get_wtime();      bitflipbyte(data,(unsigned int)ceil(NUM_DATA_BYTES/32.0),k1);      double end_time = omp_get_wtime();        printf ("\r\nData out:\r\n");      for (unsigned int j = 0; j < 4; j++)      {          for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)          {              printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);          }          printf ("\r\n");      }      printf("\r\n\r\nTime to bitflip %d bytes: %f seconds\r\n\r\n",NUM_DATA_BYTES, end_time-start_time);        // return with no errors      return 0;  }  

The pritf's for debuging..

Here is the workhorse:

bits 64  global bitflipbyte    bitflipbyte:              vmovdqa     ymm2, [rdx]          add         rdx, 20h          vmovdqa     ymm3, [rdx]          add         rdx, 20h          vmovdqa     ymm4, [rdx]  bitflipp_loop:          vmovdqa     ymm0, [rdi]           vpand       ymm1, ymm2, ymm0           vpandn      ymm0, ymm2, ymm0           vpsrld      ymm0, ymm0, 4h           vpshufb     ymm1, ymm4, ymm1           vpshufb     ymm0, ymm3, ymm0                   vpor        ymm0, ymm0, ymm1          vmovdqa     [rdi], ymm0          add     rdi, 20h          dec     rsi          jnz     bitflipp_loop          ret  

The code takes 32 bytes then masks out the nibbles. The high nibble gets shifted right by 4. then I use vpshufb and ymm4 / ymm3 as lookup tables. I could use a single lookup table but then I would have to shift left before oring the nibbles together again.

There are even faster ways of flipping the bits. But I'm bound to single thread and CPU so this was the fastest I could achieve. Can you make a faster version?

Please make no comments about using the Intel C/C++ Compiler Intrinsic Equivalent commands...

Answer by John Jennings for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


I think the simplest method I know follows. MSB is input and LSB is 'reversed' output:

unsigned char rev(char MSB) {      unsigned char LSB=0;  // for output      _FOR(i,0,8) {          LSB= LSB << 1;          if(MSB&1) LSB = LSB | 1;          MSB= MSB >> 1;      }      return LSB;  }    //    It works by rotating bytes in opposite directions.   //    Just repeat for each byte.  

Answer by MikhailJacques for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


// Purpose: to reverse bits in an unsigned short integer   // Input: an unsigned short integer whose bits are to be reversed  // Output: an unsigned short integer with the reversed bits of the input one  unsigned short ReverseBits( unsigned short a )  {       // declare and initialize number of bits in the unsigned short integer       const char num_bits = sizeof(a) * CHAR_BIT;         // declare and initialize bitset representation of integer a       bitset bitset_a(a);                   // declare and initialize bitset representation of integer b (0000000000000000)       bitset bitset_b(0);                           // declare and initialize bitset representation of mask (0000000000000001)       bitset mask(1);                   for ( char i = 0; i < num_bits; ++i )       {            bitset_b = (bitset_b << 1) | bitset_a & mask;            bitset_a >>= 1;       }         return (unsigned short) bitset_b.to_ulong();  }    void PrintBits( unsigned short a )  {       // declare and initialize bitset representation of a       bitset bitset(a);         // print out bits       cout << bitset << endl;  }      // Testing the functionality of the code    int main ()  {       unsigned short a = 17, b;         cout << "Original: ";        PrintBits(a);         b = ReverseBits( a );         cout << "Reversed: ";       PrintBits(b);  }    // Output:  Original: 0000000000010001  Reversed: 1000100000000000  

Answer by marian adam for Best Algorithm for Bit Reversal ( from MSB->LSB to LSB->MSB) in C


I was curious how fast would be the obvious raw rotation. On my machine (i7@2600), the average for 1,500,150,000 iterations was 27.28 ns (over a a random set of 131,071 64-bit integers).

Advantages: the amount of memory needed is little and the code is simple. I would say it is not that large, either. The time required is predictable and constant for any input (128 arithmetic SHIFT operations + 64 logical AND operations + 64 logical OR operations).

I compared to the best time obtained by @Matt J - who has the accepted answer. If I read his answer correctly, the best he has got was 0.631739 seconds for 1,000,000 iterations, which leads to an average of 631 ns per rotation.

The code snippet I used is this one below:

unsigned long long reverse_long(unsigned long long x)  {      return (((x >> 0) & 1) << 63) |             (((x >> 1) & 1) << 62) |             (((x >> 2) & 1) << 61) |             (((x >> 3) & 1) << 60) |             (((x >> 4) & 1) << 59) |             (((x >> 5) & 1) << 58) |             (((x >> 6) & 1) << 57) |             (((x >> 7) & 1) << 56) |             (((x >> 8) & 1) << 55) |             (((x >> 9) & 1) << 54) |             (((x >> 10) & 1) << 53) |             (((x >> 11) & 1) << 52) |             (((x >> 12) & 1) << 51) |             (((x >> 13) & 1) << 50) |             (((x >> 14) & 1) << 49) |             (((x >> 15) & 1) << 48) |             (((x >> 16) & 1) << 47) |             (((x >> 17) & 1) << 46) |             (((x >> 18) & 1) << 45) |             (((x >> 19) & 1) << 44) |             (((x >> 20) & 1) << 43) |             (((x >> 21) & 1) << 42) |             (((x >> 22) & 1) << 41) |             (((x >> 23) & 1) << 40) |             (((x >> 24) & 1) << 39) |             (((x >> 25) & 1) << 38) |             (((x >> 26) & 1) << 37) |             (((x >> 27) & 1) << 36) |             (((x >> 28) & 1) << 35) |             (((x >> 29) & 1) << 34) |             (((x >> 30) & 1) << 33) |             (((x >> 31) & 1) << 32) |             (((x >> 32) & 1) << 31) |             (((x >> 33) & 1) << 30) |             (((x >> 34) & 1) << 29) |             (((x >> 35) & 1) << 28) |             (((x >> 36) & 1) << 27) |             (((x >> 37) & 1) << 26) |             (((x >> 38) & 1) << 25) |             (((x >> 39) & 1) << 24) |             (((x >> 40) & 1) << 23) |             (((x >> 41) & 1) << 22) |             (((x >> 42) & 1) << 21) |             (((x >> 43) & 1) << 20) |             (((x >> 44) & 1) << 19) |             (((x >> 45) & 1) << 18) |             (((x >> 46) & 1) << 17) |             (((x >> 47) & 1) << 16) |             (((x >> 48) & 1) << 15) |             (((x >> 49) & 1) << 14) |             (((x >> 50) & 1) << 13) |             (((x >> 51) & 1) << 12) |             (((x >> 52) & 1) << 11) |             (((x >> 53) & 1) << 10) |             (((x >> 54) & 1) << 9) |             (((x >> 55) & 1) << 8) |             (((x >> 56) & 1) << 7) |             (((x >> 57) & 1) << 6) |             (((x >> 58) & 1) << 5) |             (((x >> 59) & 1) << 4) |             (((x >> 60) & 1) << 3) |             (((x >> 61) & 1) << 2) |             (((x >> 62) & 1) << 1) |             (((x >> 63) & 1) << 0);  }  

0 comments:

Post a Comment

Popular Posts

Powered by Blogger.