|
发表于 2023-11-1 15:51:53
|
显示全部楼层
感觉这次擂台比上次的 UInt96x96To192() 更有意义,
<br>为了大家方便交流和测试,我做了一个模板:仅一个.c文件再加一个dsp文件即可。
<br>
<br>其中为了方便大家自测算法的正确性,我已完成了一个标准c的版本,
<br>该版本的结果与我用 HugeCalc 计算的结果一致,并经过反复推敲论证,应该是无 bug 且高效的。
<br>
代码如下:
<pre>
/************************************************************************/
/* UInt128x128To256.c */
/************************************************************************/
#include <wtypes.h>
#include <stdio.h>
#include <stdlib.h>
/* Build-time switches for the test harness.                              */
/* Change "#if 0" to "#if 1" to define _RAND_TEST; initParam() then fills */
/* the operands with random values instead of all-ones.                   */
#if 0
# define _RAND_TEST /* random-operand test */
#endif
/* _TEST_TIMES is the number of calls testFun() times per function.       */
#ifndef _DEBUG /* iteration count */
# define _TEST_TIMES 10000000UL
#else
# define _TEST_TIMES 1UL /* debug build: a single call, to check correctness first */
#endif
/* Pointer type shared by every multiplication routine handed to testFun(): */
/* it takes the destination word array first, then the two source operands. */
typedef void ( *lpfn_UInt128x128To256 )( UINT32 * const result,
const UINT32 * const left,
const UINT32 * const right );
/* CPU feature flags; initParam() stores 0 or 1 in each one after CPUID. */
static BOOL s_bSupport_MMX = FALSE;
static BOOL s_bSupport_SSE = FALSE;
static BOOL s_bSupport_SSE2 = FALSE;
/* Performance-counter frequency, read by initParam() and used by testFun() */
/* as the divisor when converting counter ticks to elapsed time.            */
static UINT64 s_u64Frequency = 1;
/* Counter readings taken by testFun() before and after the timed loop. */
static UINT64 s_u64Start, s_u64End;
/* Backing storage: 4 + 4 + 8 UINT32 words, plus 15 spare bytes so the   */
/* start address can be rounded up to a 16-byte boundary.                */
static BYTE buffer[(4+4+8)*4+15];
/* Operand and result pointers into buffer; assigned by initParam(). */
static UINT32 * left = NULL;
static UINT32 * right = NULL;
static UINT32 * result = NULL;
void initParam( void )
{
/* 16字节对齐,以便于 SSE/SSE2 优化 */
left = (UINT32 *)( ((UINT32)( buffer+15 )) & -16 );
right = left + 4;
result = right + 4;
/* 待测函数中不得假定三个指针的相对偏移量情况 */
#ifndef _RAND_TEST
left[0] = 0xFFFFFFFF;
left[1] = 0xFFFFFFFF;
left[2] = 0xFFFFFFFF;
left[3] = 0xFFFFFFFF;
right[0] = 0xFFFFFFFF;
right[1] = 0xFFFFFFFF;
right[2] = 0xFFFFFFFF;
right[3] = 0xFFFFFFFF;
#else
/* 随机数确保可充满32bits */
# define RAND_VAL() (( (UINT32)(rand()) << 17 ) | rand())
srand(GetTickCount());
left[0] = RAND_VAL();
left[1] = RAND_VAL();
left[2] = RAND_VAL();
left[3] = RAND_VAL();
right[0] = RAND_VAL();
right[1] = RAND_VAL();
right[2] = RAND_VAL();
right[3] = RAND_VAL();
#endif
/* 测试 CPU 支持的指令集 */
__asm
{
mov eax, 1;
cpuid;
mov ecx, 800000h; /* 23 bit */
and ecx, edx;
neg ecx;
sbb ecx, ecx;
neg ecx;
mov dword ptr[ s_bSupport_MMX ], ecx;
mov ecx, 2000000h; /* 25 bit */
and ecx, edx;
neg ecx;
sbb ecx, ecx;
neg ecx;
mov dword ptr[ s_bSupport_SSE ], ecx;
mov ecx, 4000000h; /* 26 bit */
and ecx, edx;
neg ecx;
sbb ecx, ecx;
neg ecx;
mov dword ptr[ s_bSupport_SSE2 ], ecx;
}
QueryPerformanceFrequency((LARGE_INTEGER *)&s_u64Frequency );
}
/************************************************************************/
/* UInt128x128To256, portable C reference implementation                */
/*                                                                      */
/* Multiplies two 128-bit unsigned integers into a 256-bit product.     */
/*   result : 8 words receiving the product, least significant first    */
/*   left   : 4 words of the first operand, least significant first     */
/*   right  : 4 words of the second operand, least significant first    */
/*                                                                      */
/* High and low halves are obtained with shifts and casts, so the       */
/* routine does not depend on the byte order of the machine.            */
/* The product is built in a local array and copied out at the end, so  */
/* result may overlap left and/or right.                                */
/************************************************************************/
void UInt128x128To256_ANSI_C32( UINT32 * const result,
                                const UINT32 * const left,
                                const UINT32 * const right )
{
    UINT32 acc[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    UINT64 t;
    UINT32 carry;
    int i, j;

    /* Schoolbook multiplication, one row per word of left. */
    for ( i = 0; i < 4; ++i )
    {
        carry = 0;
        for ( j = 0; j < 4; ++j )
        {
            /* (2^32-1)^2 + 2*(2^32-1) == 2^64-1, so adding the previous
               partial word and the carry can never overflow 64 bits. */
            t  = UInt32x32To64( left[i], right[j] );
            t += acc[i + j];
            t += carry;
            acc[i + j] = (UINT32)t;
            carry      = (UINT32)( t >> 32 );
        }
        /* Word i+4 has not been written by any earlier row. */
        acc[i + 4] = carry;
    }

    for ( i = 0; i < 8; ++i )
    {
        result[i] = acc[i];
    }
}
/************************************************************************/
/* 测试函数代码粘贴区开始 begin{ */
/*----------------------------------------------------------------------*/
/* 待测函数命名规范(推荐):UInt128x128To256_{1}_{2}(...) */
/* 其中{1}代表用到的最高级指令集; */
/* {2}代表发帖楼层,如果写错了可以自行修订或请管理员帮助修改 */
/* 以后大家只需发自己待测函数的代码即可,不必再贴全测试代码 */
/************************************************************************/
/* Placeholder for an MMX implementation. As written, the naked body only */
/* executes `ret`, so none of the three arguments is touched. Replace the  */
/* marked line with the real code.                                         */
_declspec(naked)
void UInt128x128To256_MMX_xxxF( UINT32 * const result,
const UINT32 * const left,
const UINT32 * const right )
{
__asm
{
/* do something ... */
ret;
}
}
/* Placeholder for an SSE implementation. As written, the naked body only */
/* executes `ret`, so none of the three arguments is touched. Replace the  */
/* marked line with the real code.                                         */
_declspec(naked)
void UInt128x128To256_SSE_xxxF( UINT32 * const result,
const UINT32 * const left,
const UINT32 * const right )
{
__asm
{
/* do something ... */
ret;
}
}
/* Slot for the SSE2 implementation. In this listing the naked body only */
/* executes `ret`, so none of the three arguments is touched.             */
_declspec(naked)
void UInt128x128To256_SSE2_11F( UINT32 * const result,
const UINT32 * const left,
const UINT32 * const right )
{
__asm
{
/* The actual code is in post #11 of the thread (login required to view it) */
ret;
}
}
/************************************************************************/
/* }end 测试函数代码粘贴区结束 */
/************************************************************************/
/*
 * Time one multiplication routine and print its result.
 *
 *   pFun          routine to call; it receives the file-level result, left
 *                 and right pointers on every iteration
 *   lpszFunName   name shown in the report
 *   u32TestTimes  number of calls made inside the timed region
 *
 * The elapsed time is derived from two QueryPerformanceCounter() readings
 * and s_u64Frequency. Both operands and the 256-bit product are printed
 * most-significant word first.
 *
 * NOTE(review): lpszFunName is printed with %s, which assumes a build in
 * which LPCTSTR is a narrow string (UNICODE not defined) - confirm.
 */
void testFun( const lpfn_UInt128x128To256 pFun,
              const LPCTSTR lpszFunName,
              const UINT32 u32TestTimes )
{
    UINT32 i;
    UINT32 u32Elapsed;   /* elapsed time in thousandths of the printed unit */

    printf( "\nTest function: %s(..) %u times... \n",
            lpszFunName, u32TestTimes );

    QueryPerformanceCounter( (LARGE_INTEGER *)&s_u64Start );
    for ( i = 0; i < u32TestTimes; ++i )
    {
        (*pFun)( result, left, right );
    }
    QueryPerformanceCounter( (LARGE_INTEGER *)&s_u64End );

    u32Elapsed = (UINT32)(( s_u64End - s_u64Start ) * 1000000UL / s_u64Frequency );

    /* Both arguments are unsigned, so both conversions are %u. */
    printf( "Elapsed time: %u.%03u ms\n", u32Elapsed / 1000, u32Elapsed % 1000 );
    printf( " %08X %08X %08X %08X * %08X %08X %08X %08X\n"
            "= %08X %08X %08X %08X %08X %08X %08X %08X\n",
            left[3], left[2], left[1], left[0],
            right[3], right[2], right[1], right[0],
            result[7], result[6], result[5], result[4],
            result[3], result[2], result[1], result[0] );
}
int main(int argc, char* argv[])
{
initParam();
/* 标准结果 */
testFun( UInt128x128To256_ANSI_C32,
"UInt128x128To256_ANSI_C32", /*1*/ _TEST_TIMES );
/* MMX 版本测试 */
if ( s_bSupport_MMX )
{
testFun( UInt128x128To256_MMX_xxxF,
"UInt128x128To256_MMX_xxxF", _TEST_TIMES );
/* test other functions:
testFun( ... );
*/
}
/* SSE 版本测试 */
if ( s_bSupport_SSE )
{
testFun( UInt128x128To256_SSE_xxxF,
"UInt128x128To256_SSE_xxxF", _TEST_TIMES );
/* test other functions:
testFun( ... );
*/
}
/* SSE2 版本测试 */
if ( s_bSupport_SSE2 )
{
testFun( UInt128x128To256_SSE2_11F,
"UInt128x128To256_SSE2_11F", _TEST_TIMES );
/* test other functions:
testFun( ... );
*/
}
printf( "\n" );
system( "pause" );
return 0;
}</pre>
<br>完整的测试模板包如下:(包含一个.c文件,及dsp文件)
<br>
<br>请有兴趣的朋友在本地机上编译运行;如发现任何问题,请及时反馈,以便修正,谢谢! |
本帖子中包含更多资源
您需要 登录 才可以下载或查看,没有帐号?立即注册
x
|