我正在使用BGR的SIMD优化来进行灰度转换,相当于
OpenCV’s
cvtColor()
function.有一个Intel SSE版本的这个功能,我指的是它. (我正在做的是基本上将SSE代码转换为NEON代码.)
我几乎完成了编写代码,可以用g编译它,但是我无法得到正确的输出.有没有人有任何想法可能是什么错误?
我得到什么(不正确):
我应该得到什么
这是我的代码:
#include <opencv/cv.hpp> #include <opencv/highgui.h> #include <arm_neon.h> //#include <iostream> using namespace std; //using namespace cv; #define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v),vget_high_s8(v) }) void cvtBGR2GrayNEON(cv::Mat& src,cv::Mat& dest) { const int size = src.size().area()*src.channels(); uchar* s = src.ptr<uchar>(0); uchar* d = dest.ptr<uchar>(0); const int8x16_t mask1 = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14}; const int8x16_t smask1 = {6,14,15}; const int8x16_t ssmask1 = {11,10}; const int8x16_t mask2 = {0,13}; const int8x16_t ssmask2 = {0,10}; const int8x16_t bmask1 = {255,255,0}; const int8x16_t bmask2 = {255,0}; const int8x16_t bmask3 = {255,0}; const int8x16_t bmask4 = {255,0}; const int shift = 8; const int amp = 1<<shift; const int16_t _R_ = (int16_t)(amp*0.299); const int16_t _G_ = (int16_t)(amp*0.587); const int16_t _B_ = (int16_t)(amp*0.114); const int16x8_t R = vdupq_n_s16(_R_); const int16x8_t G = vdupq_n_s16(_G_); const int16x8_t B = vdupq_n_s16(_B_); const int8x16_t zero = vdupq_n_s8(0); for(int i = 0; i < size; i += 48) { int8x16_t a = vld1q_s8((int8_t *) s + i); int8x16_t b = vld1q_s8((int8_t *) s + i + 16); int8x16_t c = vld1q_s8((int8_t *) s + i + 32); a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a),vget_low_s8(mask1)),vtbl2_s8(int8x16_to_8x8x2(a),vget_high_s8(mask1))); b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b),vget_low_s8(mask2)),vtbl2_s8(int8x16_to_8x8x2(b),vget_high_s8(mask2))); c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c),vtbl2_s8(int8x16_to_8x8x2(c),vget_high_s8(mask2))); //BBBBBB const int8x16_t aaaa = vbslq_s8(c,vbslq_s8(b,a,bmask1),bmask2); a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a),vget_low_s8(smask1)),vget_high_s8(smask1))); b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b),vget_high_s8(smask1))); c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c),vget_high_s8(smask1))); //GGGGGG const int8x16_t bbbb = vbslq_s8(c,bmask3),vget_low_s8(ssmask1)),vget_high_s8(ssmask1))); c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c),vget_high_s8(ssmask1))); b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b),vget_low_s8(ssmask2)),vget_high_s8(ssmask2))); //RRRRRR const int8x16_t cccc = vbslq_s8(c,bmask4); /* int8x8x2_t a1 = vzip_s8(vget_high_s8(aaaa),vget_high_s8(zero)); int8x8x2_t a2 = vzip_s8(vget_low_s8(aaaa),vget_low_s8(zero)); */ int8x16_t a1 = aaaa; int8x16_t a2 = zero; int8x16x2_t temp1 = vzipq_s8(a1,a2); a1 = temp1.val[0]; a2 = temp1.val[1]; int16x8_t aa1 = vmulq_s16((int16x8_t)a2,B); int16x8_t aa2 = vmulq_s16((int16x8_t)a1,B); int8x16_t b1 = bbbb; int8x16_t b2 = zero; int8x16x2_t temp2 = vzipq_s8(b1,b2); b1 = temp2.val[0]; b2 = temp2.val[1]; int16x8_t bb1 = vmulq_s16((int16x8_t)b2,G); int16x8_t bb2 = vmulq_s16((int16x8_t)b1,G); int8x16_t c1 = cccc; int8x16_t c2 = zero; int8x16x2_t temp3 = vzipq_s8(c1,c2); c1 = temp3.val[0]; c2 = temp3.val[1]; int16x8_t cc1 = vmulq_s16((int16x8_t)c2,R); int16x8_t cc2 = vmulq_s16((int16x8_t)c1,R); aa1 = vaddq_s16(aa1,bb1); aa1 = vaddq_s16(aa1,cc1); aa2 = vaddq_s16(aa2,bb2); aa2 = vaddq_s16(aa2,cc2); const int shift1 = 8; aa1 = vshrq_n_s16(aa1,shift1); aa2 = vshrq_n_s16(aa2,shift1); uint8x8_t aaa1 = vqmovun_s16(aa1); uint8x8_t aaa2 = vqmovun_s16(aa2); uint8x16_t result = vcombine_u8(aaa1,aaa2); vst1q_u8((uint8_t *)(d),result); d+=16; } } int main() { cv::Mat src = cv::imread("Lenna.bmp"); cv::Mat dest(src.rows,src.cols,CV_8UC1); cvtBGR2GrayNEON(src,dest); cv::imwrite("grey.jpg",dest); return 0; }
void cvtBGR2GraySSEShort(Mat& src,Mat& dest) { const int size = src.size().area()*src.channels(); uchar* s = src.ptr<uchar>(0); uchar* d = dest.ptr<uchar>(0); //data structure //BGR BGR BGR BGR BGR B //GR BGR BGR BGR BGR BG //R BGR BGR BGR BGR BGR //shuffle to BBBBBBGGGGGRRRRR const __m128i mask1 = _mm_setr_epi8(0,14); const __m128i smask1 = _mm_setr_epi8(6,15); const __m128i ssmask1 = _mm_setr_epi8(11,10); //shuffle to GGGGGGBBBBBRRRRR const __m128i mask2 = _mm_setr_epi8(0,13); //const __m128i smask2 = _mm_setr_epi8(6,15);same as smask1 const __m128i ssmask2 = _mm_setr_epi8(0,10); //shuffle to RRRRRRGGGGGBBBBB //__m128i mask3 = _mm_setr_epi8(0,13);//same as mask2 //const __m128i smask3 = _mm_setr_epi8(6,10);//same as smask1 //const __m128i ssmask3 = _mm_setr_epi8(11,10);//same as ssmask1 //blend mask const __m128i bmask1 = _mm_setr_epi8 (255,0); const __m128i bmask2 = _mm_setr_epi8 (255,0); const __m128i bmask3 = _mm_setr_epi8 (255,0); const __m128i bmask4 = _mm_setr_epi8 (255,0); const int shift = 8; const int amp = 1<<shift; const int _R_=(int)(amp*0.299); const int _G_=(int)(amp*0.587); const int _B_=(int)(amp*0.114); const __m128i R = _mm_set1_epi16(_R_); const __m128i G = _mm_set1_epi16(_G_); const __m128i B = _mm_set1_epi16(_B_); const __m128i zero = _mm_setzero_si128(); for(int i=0;i<size;i+=48) { __m128i a = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i)),mask1); __m128i b = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+16)),mask2); __m128i c = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+32)),mask2); const __m128i aaaa = _mm_blendv_epi8(c,_mm_blendv_epi8(b,bmask2); a = _mm_shuffle_epi8(a,smask1); b = _mm_shuffle_epi8(b,smask1); c = _mm_shuffle_epi8(c,smask1); const __m128i bbbb =_mm_blendv_epi8(c,ssmask1); c = _mm_shuffle_epi8(c,ssmask1); b = _mm_shuffle_epi8(b,ssmask2); const __m128i cccc =_mm_blendv_epi8(c,bmask4); __m128i a1 = _mm_unpackhi_epi8(aaaa,zero); __m128i a2 = _mm_unpacklo_epi8(aaaa,zero); a1 = _mm_mullo_epi16(a1,B); a2 = _mm_mullo_epi16(a2,B); __m128i b1 = _mm_unpackhi_epi8(bbbb,zero); __m128i b2 = _mm_unpacklo_epi8(bbbb,zero); b1 = _mm_mullo_epi16(b1,G); b2 = _mm_mullo_epi16(b2,G); __m128i c1 = _mm_unpackhi_epi8(cccc,zero); __m128i c2 = _mm_unpacklo_epi8(cccc,zero); c1 = _mm_mullo_epi16(c1,R); c2 = _mm_mullo_epi16(c2,R); a1 = _mm_add_epi16(a1,b1); a1 = _mm_add_epi16(a1,c1); a2 = _mm_add_epi16(a2,b2); a2 = _mm_add_epi16(a2,c2); a1 = _mm_srli_epi16(a1,8); a2 = _mm_srli_epi16(a2,8); a = _mm_packus_epi16(a1,a2); _mm_stream_si128((__m128i*)(d),a); d+=16; } }
解决方法
好的,下面是我刚刚写的那个函数的完全优化版本(请注意,如果大小小于32,则此函数只返回)
/* * Created on: 2014. 7. 27. * Author: Jake Lee * Project FANIC - Fastest ARM NEON Implementaion Challenge */ // void fanicCvtBGR2GrayNEON(void *pDst,void *pSrc,unsigned int size); // Y = 0.114*B + 0.587*G + 0.299*R .text .arm .global fanicCvtBGR2GrayNEON pDst .req r0 pSrc .req r1 size .req r2 .align 5 .func fanicCvtBGR2GrayNEON: pld [pSrc] subs size,size,#32 pld [pSrc,#64] bxmi lr pld [pSrc,#64*2] vmov.i8 d0,#29 vmov.i8 d1,#150 vmov.i8 d2,#77 .align 5 1: vld3.8 {d20,d21,d22},[pSrc]! vld3.8 {d23,d24,d25},[pSrc]! vld3.8 {d26,d27,d28},[pSrc]! vld3.8 {d29,d30,d31},[pSrc]! vmull.u8 q8,d20,d0 vmlal.u8 q8,d1 vmlal.u8 q8,d22,d2 vmull.u8 q9,d23,d0 vmlal.u8 q9,d1 vmlal.u8 q9,d25,d2 vmull.u8 q10,d26,d0 vmlal.u8 q10,d1 vmlal.u8 q10,d28,d2 vmull.u8 q11,d29,d0 vmlal.u8 q11,d1 vmlal.u8 q11,d31,d2 vrshrn.u16 d24,q8,#8 vrshrn.u16 d25,q9,#8 vrshrn.u16 d26,q10,#8 vrshrn.u16 d27,q11,#8 subs size,#64*3] pld [pSrc,#64*4] vst1.8 {q12,q13},[pDst]! bpl 1b cmp size,#-32 add pSrc,pSrc,size bxle lr add pSrc,lsl #1 add pDst,pDst,size b 1b .endfunc .end
正如你所看到的,在组装中编写NEON代码比在内在函数中更容易和更短,尽管重度展开.
玩的开心.