I have a pointer to a mixed byte array that contains the interleaved bytes of two distinct arrays, array1 and array2. Say mixed looks like this:
a1b2c3d4...
What I need to do is de-interleave the bytes so that I get array1 = abcd... and array2 = 1234.... I know the length of mixed ahead of time, and the lengths of array1 and array2 are each half the length of mixed.
Here is my current implementation (array1 and array2 are already allocated):
int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
    array1[i] = mixed[j];
    array2[i] = mixed[j+1];
}
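For concreteness, here is a minimal, self-contained test of that loop against the example data above (my own untested sketch; the literal "a1b2c3d4" input and the printf check are illustration only, not part of the original code):

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char mixed[] = "a1b2c3d4";        // interleaved input from the example
    int mixedLength = (int)strlen(mixed);   // 8 bytes
    char array1[5] = {0}, array2[5] = {0};  // each holds mixedLength / 2 bytes

    int i, j;
    int mixedLength_2 = mixedLength / 2;
    for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
    {
        array1[i] = mixed[j];
        array2[i] = mixed[j+1];
    }

    printf("%s %s\n", array1, array2);      // expected output: "abcd 1234"
    return 0;
}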
This avoids any expensive multiplication or division operations, but it still doesn't run fast enough. I'm hoping there is something like memcpy that takes a stride, so it can use low-level block-copy operations to speed up the process. Is there a faster implementation than what I currently have?
Edit
The target platform is Objective-C on iOS and Mac. A fast operation matters more on iOS devices, so a solution tailored to iOS would be even better.
Update
Thanks to everyone for the responses, especially Stephen Canon, Graham Lee, and Mecki. Here is my "master" function that uses Stephen's NEON intrinsics when they are available, and otherwise falls back to Graham's union cursors with the reduced iteration count that Mecki suggested.
#include <stdint.h>
#include <stdlib.h>   // div()
#include <stddef.h>
#if defined __ARM_NEON__
#   include <arm_neon.h>
#endif

void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstABLength)
{
#if defined __ARM_NEON__
    // attempt to use NEON intrinsics

    // iterate 32-bytes at a time
    div_t dstABLength_32 = div(dstABLength, 32);

    if (dstABLength_32.rem == 0)
    {
        while (dstABLength_32.quot --> 0)
        {
            const uint8x16_t a = vld1q_u8(srcA);
            const uint8x16_t b = vld1q_u8(srcB);
            const uint8x16x2_t ab = { a, b };
            vst2q_u8(dstAB, ab);
            srcA += 16;
            srcB += 16;
            dstAB += 32;
        }
        return;
    }

    // iterate 16-bytes at a time
    div_t dstABLength_16 = div(dstABLength, 16);

    if (dstABLength_16.rem == 0)
    {
        while (dstABLength_16.quot --> 0)
        {
            const uint8x8_t a = vld1_u8(srcA);
            const uint8x8_t b = vld1_u8(srcB);
            const uint8x8x2_t ab = { a, b };
            vst2_u8(dstAB, ab);
            srcA += 8;
            srcB += 8;
            dstAB += 16;
        }
        return;
    }
#endif

    // if the bytes were not aligned properly
    // or NEON is unavailable, fall back to
    // an optimized iteration

    // iterate 8-bytes at a time
    div_t dstABLength_8 = div(dstABLength, 8);

    if (dstABLength_8.rem == 0)
    {
        typedef union
        {
            uint64_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2;
                     uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
        } ab8x8_t;

        uint64_t *dstAB64 = (uint64_t *)dstAB;
        int j = 0;

        for (int i = 0; i < dstABLength_8.quot; i++)
        {
            ab8x8_t cursor;
            cursor.narrow.a1 = srcA[j  ];
            cursor.narrow.b1 = srcB[j++];
            cursor.narrow.a2 = srcA[j  ];
            cursor.narrow.b2 = srcB[j++];
            cursor.narrow.a3 = srcA[j  ];
            cursor.narrow.b3 = srcB[j++];
            cursor.narrow.a4 = srcA[j  ];
            cursor.narrow.b4 = srcB[j++];
            dstAB64[i] = cursor.wide;
        }
        return;
    }

    // iterate 4-bytes at a time
    div_t dstABLength_4 = div(dstABLength, 4);

    if (dstABLength_4.rem == 0)
    {
        typedef union
        {
            uint32_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
        } ab8x4_t;

        uint32_t *dstAB32 = (uint32_t *)dstAB;
        int j = 0;

        for (int i = 0; i < dstABLength_4.quot; i++)
        {
            ab8x4_t cursor;
            cursor.narrow.a1 = srcA[j  ];
            cursor.narrow.b1 = srcB[j++];
            cursor.narrow.a2 = srcA[j  ];
            cursor.narrow.b2 = srcB[j++];
            dstAB32[i] = cursor.wide;
        }
        return;
    }

    // iterate 2-bytes at a time
    div_t dstABLength_2 = div(dstABLength, 2);

    typedef union
    {
        uint16_t wide;
        struct { uint8_t a; uint8_t b; } narrow;
    } ab8x2_t;

    uint16_t *dstAB16 = (uint16_t *)dstAB;

    for (int i = 0; i < dstABLength_2.quot; i++)
    {
        ab8x2_t cursor;
        cursor.narrow.a = srcA[i];
        cursor.narrow.b = srcB[i];
        dstAB16[i] = cursor.wide;
    }
}

void deinterleave(const uint8_t *srcAB, uint8_t *dstA, uint8_t *dstB, size_t srcABLength)
{
#if defined __ARM_NEON__
    // attempt to use NEON intrinsics

    // iterate 32-bytes at a time
    div_t srcABLength_32 = div(srcABLength, 32);

    if (srcABLength_32.rem == 0)
    {
        while (srcABLength_32.quot --> 0)
        {
            const uint8x16x2_t ab = vld2q_u8(srcAB);
            vst1q_u8(dstA, ab.val[0]);
            vst1q_u8(dstB, ab.val[1]);
            srcAB += 32;
            dstA += 16;
            dstB += 16;
        }
        return;
    }

    // iterate 16-bytes at a time
    div_t srcABLength_16 = div(srcABLength, 16);

    if (srcABLength_16.rem == 0)
    {
        while (srcABLength_16.quot --> 0)
        {
            const uint8x8x2_t ab = vld2_u8(srcAB);
            vst1_u8(dstA, ab.val[0]);
            vst1_u8(dstB, ab.val[1]);
            srcAB += 16;
            dstA += 8;
            dstB += 8;
        }
        return;
    }
#endif

    // if the bytes were not aligned properly
    // or NEON is unavailable, fall back to
    // an optimized iteration

    // iterate 8-bytes at a time
    div_t srcABLength_8 = div(srcABLength, 8);

    if (srcABLength_8.rem == 0)
    {
        typedef union
        {
            uint64_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2;
                     uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
        } ab8x8_t;

        uint64_t *srcAB64 = (uint64_t *)srcAB;
        int j = 0;

        for (int i = 0; i < srcABLength_8.quot; i++)
        {
            ab8x8_t cursor;
            cursor.wide = srcAB64[i];
            dstA[j  ] = cursor.narrow.a1;
            dstB[j++] = cursor.narrow.b1;
            dstA[j  ] = cursor.narrow.a2;
            dstB[j++] = cursor.narrow.b2;
            dstA[j  ] = cursor.narrow.a3;
            dstB[j++] = cursor.narrow.b3;
            dstA[j  ] = cursor.narrow.a4;
            dstB[j++] = cursor.narrow.b4;
        }
        return;
    }

    // iterate 4-bytes at a time
    div_t srcABLength_4 = div(srcABLength, 4);

    if (srcABLength_4.rem == 0)
    {
        typedef union
        {
            uint32_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
        } ab8x4_t;

        uint32_t *srcAB32 = (uint32_t *)srcAB;
        int j = 0;

        for (int i = 0; i < srcABLength_4.quot; i++)
        {
            ab8x4_t cursor;
            cursor.wide = srcAB32[i];
            dstA[j  ] = cursor.narrow.a1;
            dstB[j++] = cursor.narrow.b1;
            dstA[j  ] = cursor.narrow.a2;
            dstB[j++] = cursor.narrow.b2;
        }
        return;
    }

    // iterate 2-bytes at a time
    div_t srcABLength_2 = div(srcABLength, 2);

    typedef union
    {
        uint16_t wide;
        struct { uint8_t a; uint8_t b; } narrow;
    } ab8x2_t;

    uint16_t *srcAB16 = (uint16_t *)srcAB;

    for (int i = 0; i < srcABLength_2.quot; i++)
    {
        ab8x2_t cursor;
        cursor.wide = srcAB16[i];
        dstA[i] = cursor.narrow.a;
        dstB[i] = cursor.narrow.b;
    }
}
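For reference, a hypothetical call site might look like the following (my own illustration, not part of the original code: the buffer names, the count parameter, and the malloc sizing are assumptions, and it relies on the two functions above being in scope):

#include <stdint.h>
#include <stdlib.h>

void example(const uint8_t *samplesA, const uint8_t *samplesB, size_t count)
{
    uint8_t *mixed = malloc(count * 2);   // interleaved buffer: 2 bytes per pair
    uint8_t *outA  = malloc(count);
    uint8_t *outB  = malloc(count);

    interleave(samplesA, samplesB, mixed, count * 2);   // a1b2c3d4...
    deinterleave(mixed, outA, outB, count * 2);         // back to abcd... / 1234...

    free(mixed);
    free(outA);
    free(outB);
}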
Solution
Off the top of my head, I don't know of a library function for de-interleaving 2-channel byte data. However, it's worth filing a bug report with Apple to request such a function.
In the meantime, it's fairly easy to vectorize such a function using NEON or SSE intrinsics. Specifically, on ARM you would use vld1q_u8 to load vectors from the source array, vuzpq_u8 to de-interleave them, and vst1q_u8 to store the resulting vectors. Here is a rough sketch that I haven't tested or even tried to build, but it should illustrate the general idea. More sophisticated implementations are definitely possible (in particular, NEON can load/store two 16-byte registers in a single instruction, which the compiler may not do with this code, and some amount of pipelining and/or unrolling may be beneficial depending on how long your buffers are):
#if defined __ARM_NEON__
#   include <arm_neon.h>
#endif
#include <stdint.h>
#include <stddef.h>

void deinterleave(uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength)
{
#if defined __ARM_NEON__
    size_t vectors = mixedLength / 32;
    mixedLength %= 32;
    while (vectors --> 0)
    {
        const uint8x16_t src0 = vld1q_u8(mixed);
        const uint8x16_t src1 = vld1q_u8(mixed + 16);
        const uint8x16x2_t dst = vuzpq_u8(src0, src1);
        vst1q_u8(array1, dst.val[0]);
        vst1q_u8(array2, dst.val[1]);
        mixed += 32;
        array1 += 16;
        array2 += 16;
    }
#endif
    for (size_t i=0; i<mixedLength/2; ++i)
    {
        array1[i] = mixed[2*i];
        array2[i] = mixed[2*i + 1];
    }
}
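For the SSE side mentioned above, a roughly equivalent sketch could use _mm_shuffle_epi8 to un-zip each 16-byte block and _mm_unpacklo_epi64/_mm_unpackhi_epi64 to combine the halves. This is my own untested sketch, not part of the original answer; it assumes SSSE3 is available and, like the NEON version, falls back to a scalar loop for any tail bytes:

#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
#include <stdint.h>
#include <stddef.h>

void deinterleave_sse(const uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength)
{
    // shuffle mask: even (A) bytes into the low 8 lanes, odd (B) bytes into the high 8 lanes
    const __m128i unzip = _mm_setr_epi8(0,2,4,6,8,10,12,14, 1,3,5,7,9,11,13,15);

    size_t vectors = mixedLength / 32;
    mixedLength %= 32;

    while (vectors --> 0)
    {
        const __m128i s0 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)mixed),        unzip);
        const __m128i s1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(mixed + 16)), unzip);
        _mm_storeu_si128((__m128i *)array1, _mm_unpacklo_epi64(s0, s1));  // 16 A bytes
        _mm_storeu_si128((__m128i *)array2, _mm_unpackhi_epi64(s0, s1));  // 16 B bytes
        mixed  += 32;
        array1 += 16;
        array2 += 16;
    }

    // scalar tail for whatever is left over
    for (size_t i = 0; i < mixedLength/2; ++i)
    {
        array1[i] = mixed[2*i];
        array2[i] = mixed[2*i + 1];
    }
}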