#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#ifdef WIN32
#include <windows.h>
#else
#include<sys/time.h>
#include<unistd.h>
#endif
double get_time_ms()
{
#ifdef WIN32
return (double)GetTickCount();
#else
struct timeval tv;
struct timezone tz;
gettimeofday (&tv , &tz);
return tv.tv_sec * 1000 + tv.tv_usec / 1000 ;
#endif
}
//这是微软公开的crt src文件夹内的源代码:
void * memcpy_1 (void * dst, void * src , size_t n)
{
void * ret = dst;
while (n--) {
*(char *)dst = *(char *)src;
dst = (char *)dst + 1;
src = (char *)src + 1;
}
return ret;
}
//这是我写的,按照最近流行的优化方式做的
void * memcpy_2 (void * dst, void * src , size_t n)
{
void * ret = dst;
size_t l = n>>2;
while (l--) {
*(size_t *)dst = *(size_t *)src;
dst = (size_t *)dst + 1;
src = (size_t *)src + 1;
}
l = n & 3;
while (l--) {
*(char *)dst = *(char *)src;
dst = (char *)dst + 1;
src = (char *)src + 1;
}
return ret;
}
void main(int argc, char ** argv)
{
const int cstMemSize = 0x2FFFFFF;
if(argc<2)
{
printf("请输入参数 [ 0 | 1 | 2 | 3] ");
return ;
}
char * str = (char*)malloc(cstMemSize);
char * str2 = (char*)malloc(cstMemSize+16);
size_t t1 = get_time_ms();
switch(argv[1][0])
{
case '0': memcpy(str, str2, cstMemSize );
break;
case '1': memcpy_1(str, str2, cstMemSize );
break;
case '2': memcpy_2(str, str2, cstMemSize );
break;
case '3': memcpy_2(str, str2+1, cstMemSize ); //考虑字节不对齐的情况
break;
}
double t2 = get_time_ms();
free(str);
free(str2);
printf("memset_%c %f\n", argv[1][0] ,t2-t1);
}
在windows xp/vc6环境下:
结果发现memcpy_1是最慢的,当然,这是串操作,当然很慢;
但是直接调用memcpy函数是最快的,这大概是微软在vc链接程序时做的优化.
而上面memcpy_2的效果一般般,比memcpy_1快,但是不如直接调用memcpy快.
在gcc/freebsd环境下,如果未加优化参数-O2,case '3'的情况速度更慢.我怀疑可能是字节不对齐带来的恶果.
不过,当开启了-O2后,4个case的测试时间差不多了.但是还是gcc自己的memset函数最快.
我现在怀疑,象(int *)这样的奇技淫巧如果使用不当,带来的效果不明显,还不如不用.现实情况是复杂的,编译器比我考虑的多的多.
bleem1998 回复于:2006-12-28 17:45:22
一般memcpy都是用废编写的呢
你怎么可能快得过它
dulao5 回复于:2006-12-28 17:48:45
估计是人家使用了更优化的指令.
gvim 回复于:2006-12-28 18:25:55
呵呵,下面的代码,UV管道都计算的异常精确,通过判断是否支持sse等高级操作来加快效率。
当然效率极高。
补充一句:这是来自微软的代码,一个朋友给我的,似乎是VS 2007里面的。
page ,132
title memcpy - Copy source memory bytes to destination
;***
;memcpy.asm - contains memcpy and memmove routines
;
; Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
; memcpy() copies a source memory buffer to a destination buffer.
; Overlapping buffers are not treated specially, so propogation may occur.
; memmove() copies a source memory buffer to a destination buffer.
; Overlapping buffers are treated specially, to avoid propogation.
;
;*******************************************************************************
.xlist
include cruntime.inc
.list
M_EXIT macro
ret ; _cdecl return
endm ; M_EXIT
CODESEG
extrn _VEC_memcpy:near
extrn __sse2_available:dword
page
;***
;memcpy - Copy source buffer to destination buffer
;
;Purpose:
; memcpy() copies a source memory buffer to a destination memory buffer.
; This routine does NOT recognize overlapping buffers, and thus can lead
; to propogation.
; For cases where propogation must be avoided, memmove() must be used.
;
; Algorithm:
;
; Same as memmove. See Below
;
;
;memmove - Copy source buffer to destination buffer
;
;Purpose:
; memmove() copies a source memory buffer to a destination memory buffer.
; This routine recognize overlapping buffers to avoid propogation.
; For cases where propogation is not a problem, memcpy() can be used.
;
; Algorithm:
;
; void * memmove(void * dst, void * src, size_t count)
; {
; void * ret = dst;
;
; if (dst <= src || dst >= (src + count)) {
; /*
; * Non-Overlapping Buffers
; * copy from lower addresses to higher addresses
; */
; while (count--)
; *dst++ = *src++;
; }
; else {
; /*
; * Overlapping Buffers
; * copy from higher addresses to lower addresses
; */
; dst += count - 1;
; src += count - 1;
;
; while (count--)
; *dst-- = *src--;
; }
;
; return(ret);
; }
;
;
;Entry:
; void *dst = pointer to destination buffer
; const void *src = pointer to source buffer
; size_t count = number of bytes to copy
;
;Exit:
; Returns a pointer to the destination buffer in AX/DX:AX
;
;Uses:
; CX, DX
;
;Exceptions:
;*******************************************************************************
后续
[ 本帖最后由 gvim 于 2006-12-28 18:34 编辑 ]
gvim 回复于:2006-12-28 18:26:41
接上
ifdef MEM_MOVE
_MEM_ equ <memmove>
else ; MEM_MOVE
_MEM_ equ <memcpy>
endif ; MEM_MOVE
% public _MEM_
_MEM_ proc \
dst:ptr byte, \
src:ptr byte, \
count:IWORD
; destination pointer
; source pointer
; number of bytes to copy
; push ebp ;U - save old frame pointer
; mov ebp, esp ;V - set new frame pointer
push edi ;U - save edi
push esi ;V - save esi
mov esi,[src] ;U - esi = source
mov ecx,[count] ;V - ecx = number of bytes to move
mov edi,[dst] ;U - edi = dest
;
; Check for overlapping buffers:
; If (dst <= src) Or (dst >= src + Count) Then
; Do normal (Upwards) Copy
; Else
; Do Downwards Copy to avoid propagation
;
mov eax,ecx ;V - eax = byte count...
mov edx,ecx ;U - edx = byte count...
add eax,esi ;V - eax = point past source end
cmp edi,esi ;U - dst <= src ?
jbe short CopyUp ;V - yes, copy toward higher addresses
cmp edi,eax ;U - dst < (src + count) ?
jb CopyDown ;V - yes, copy toward lower addresses
;
; Copy toward higher addresses.
;
CopyUp:
;
; First, see if we can use a "fast" copy SSE2 routine
; block size greater than min threshold?
cmp ecx,0100h
jb Dword_align
; SSE2 supported?
cmp DWORD PTR __sse2_available,0
je Dword_align
; alignments equal?
push edi
push esi
and edi,15
and esi,15
cmp edi,esi
pop esi
pop edi
jne Dword_align
; do fast SSE2 copy, params already on stack
pop esi
pop edi
pop ebp
jmp _VEC_memcpy
; no return
;
; The algorithm for forward moves is to align the destination to a dword
; boundary and so we can move dwords with an aligned destination. This
; occurs in 3 steps.
;
; - move x = ((4 - Dest & 3) & 3) bytes
; - move y = ((L-x) >> 2) dwords
; - move (L - x - y*4) bytes
;
Dword_align:
test edi,11b ;U - destination dword aligned?
jnz short CopyLeadUp ;V - if we are not dword aligned already, align
shr ecx,2 ;U - shift down to dword count
and edx,11b ;V - trailing byte count
cmp ecx,8 ;U - test if small enough for unwind copy
jb short CopyUnwindUp ;V - if so, then jump
rep movsd ;N - move all of our dwords
jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes
;
; Code to do optimal memory copies for non-dword-aligned destinations.
;
; The following length check is done for two reasons:
;
; 1. to ensure that the actual move length is greater than any possiale
; alignment move, and
;
; 2. to skip the multiple move logic for small moves where it would
; be faster to move the bytes with one instruction.
;
align @WordSize
CopyLeadUp:
mov eax,edi ;U - get destination offset
mov edx,11b ;V - prepare for mask
sub ecx,4 ;U - check for really short string - sub for adjust
jb short ByteCopyUp ;V - branch to just copy bytes
and eax,11b ;U - get offset within first dword
add ecx,eax ;V - update size after leading bytes copied
jmp dword ptr LeadUpVec[eax*4-4] ;N - process leading bytes
align @WordSize
ByteCopyUp:
jmp dword ptr TrailUpVec[ecx*4+16] ;N - process just bytes
align @WordSize
CopyUnwindUp:
jmp dword ptr UnwindUpVec[ecx*4] ;N - unwind dword copy
align @WordSize
LeadUpVec dd LeadUp1, LeadUp2, LeadUp3
align @WordSize
LeadUp1:
and edx,ecx ;U - trailing byte count
mov al,[esi] ;V - get first byte from source
mov [edi],al ;U - write second byte to destination
mov al,[esi+1] ;V - get second byte from source
mov [edi+1],al ;U - write second byte to destination
mov al,[esi+2] ;V - get third byte from source
shr ecx,2 ;U - shift down to dword count
mov [edi+2],al ;V - write third byte to destination
add esi,3 ;U - advance source pointer
add edi,3 ;V - advance destination pointer
cmp ecx,8 ;U - test if small enough for unwind copy
jb short CopyUnwindUp ;V - if so, then jump
rep movsd ;N - move all of our dwords
jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes
align @WordSize
LeadUp2:
and edx,ecx ;U - trailing byte count
mov al,[esi] ;V - get first byte from source
mov [edi],al ;U - write second byte to destination
mov al,[esi+1] ;V - get second byte from source
shr ecx,2 ;U - shift down to dword count
mov [edi+1],al ;V - write second byte to destination
add esi,2 ;U - advance source pointer
add edi,2 ;V - advance destination pointer
cmp ecx,8 ;U - test if small enough for unwind copy
jb short CopyUnwindUp ;V - if so, then jump
rep movsd ;N - move all of our dwords
jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes
align @WordSize
LeadUp3:
and edx,ecx ;U - trailing byte count
mov al,[esi] ;V - get first byte from source
mov [edi],al ;U - write second byte to destination
add esi,1 ;V - advance source pointer
shr ecx,2 ;U - shift down to dword count
add edi,1 ;V - advance destination pointer
cmp ecx,8 ;U - test if small enough for unwind copy
jb short CopyUnwindUp ;V - if so, then jump
rep movsd ;N - move all of our dwords
jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes
align @WordSize
UnwindUpVec dd UnwindUp0, UnwindUp1, UnwindUp2, UnwindUp3
dd UnwindUp4, UnwindUp5, UnwindUp6, UnwindUp7
UnwindUp7:
mov eax,[esi+ecx*4-28] ;U - get dword from source
;V - spare
mov [edi+ecx*4-28],eax ;U - put dword into destination
UnwindUp6:
mov eax,[esi+ecx*4-24] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4-24],eax ;U - put dword into destination
UnwindUp5:
mov eax,[esi+ecx*4-20] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4-20],eax ;U - put dword into destination
UnwindUp4:
mov eax,[esi+ecx*4-16] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4-16],eax ;U - put dword into destination
UnwindUp3:
mov eax,[esi+ecx*4-12] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4-12],eax ;U - put dword into destination
UnwindUp2:
mov eax,[esi+ecx*4-8] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4-8],eax ;U - put dword into destination
UnwindUp1:
mov eax,[esi+ecx*4-4] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4-4],eax ;U - put dword into destination
lea eax,[ecx*4] ;V - compute update for pointer
add esi,eax ;U - update source pointer
add edi,eax ;V - update destination pointer
UnwindUp0:
jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes
;-----------------------------------------------------------------------------
align @WordSize
TrailUpVec dd TrailUp0, TrailUp1, TrailUp2, TrailUp3
align @WordSize
TrailUp0:
mov eax,[dst] ;U - return pointer to destination
pop esi ;V - restore esi
pop edi ;U - restore edi
;V - spare
M_EXIT
align @WordSize
TrailUp1:
mov al,[esi] ;U - get byte from source
;V - spare
mov [edi],al ;U - put byte in destination
mov eax,[dst] ;V - return pointer to destination
pop esi ;U - restore esi
pop edi ;V - restore edi
M_EXIT
align @WordSize
TrailUp2:
mov al,[esi] ;U - get first byte from source
;V - spare
mov [edi],al ;U - put first byte into destination
mov al,[esi+1] ;V - get second byte from source
mov [edi+1],al ;U - put second byte into destination
mov eax,[dst] ;V - return pointer to destination
pop esi ;U - restore esi
pop edi ;V - restore edi
M_EXIT
align @WordSize
TrailUp3:
mov al,[esi] ;U - get first byte from source
;V - spare
mov [edi],al ;U - put first byte into destination
mov al,[esi+1] ;V - get second byte from source
mov [edi+1],al ;U - put second byte into destination
mov al,[esi+2] ;V - get third byte from source
mov [edi+2],al ;U - put third byte into destination
mov eax,[dst] ;V - return pointer to destination
pop esi ;U - restore esi
pop edi ;V - restore edi
M_EXIT
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;
; Copy down to avoid propogation in overlapping buffers.
;
align @WordSize
CopyDown:
lea esi,[esi+ecx-4] ;U - point to 4 bytes before src buffer end
lea edi,[edi+ecx-4] ;V - point to 4 bytes before dest buffer end
;
; See if the destination start is dword aligned
;
test edi,11b ;U - test if dword aligned
jnz short CopyLeadDown ;V - if not, jump
shr ecx,2 ;U - shift down to dword count
and edx,11b ;V - trailing byte count
cmp ecx,8 ;U - test if small enough for unwind copy
jb short CopyUnwindDown ;V - if so, then jump
std ;N - set direction flag
rep movsd ;N - move all of our dwords
cld ;N - clear direction flag back
jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes
align @WordSize
CopyUnwindDown:
neg ecx ;U - negate dword count for table merging
;V - spare
jmp dword ptr UnwindDownVec[ecx*4+28] ;N - unwind copy
align @WordSize
CopyLeadDown:
mov eax,edi ;U - get destination offset
mov edx,11b ;V - prepare for mask
cmp ecx,4 ;U - check for really short string
jb short ByteCopyDown ;V - branch to just copy bytes
and eax,11b ;U - get offset within first dword
sub ecx,eax ;U - to update size after lead copied
jmp dword ptr LeadDownVec[eax*4-4] ;N - process leading bytes
align @WordSize
ByteCopyDown:
jmp dword ptr TrailDownVec[ecx*4] ;N - process just bytes
align @WordSize
LeadDownVec dd LeadDown1, LeadDown2, LeadDown3
align @WordSize
LeadDown1:
mov al,[esi+3] ;U - load first byte
and edx,ecx ;V - trailing byte count
mov [edi+3],al ;U - write out first byte
sub esi,1 ;V - point to last src dword
shr ecx,2 ;U - shift down to dword count
sub edi,1 ;V - point to last dest dword
cmp ecx,8 ;U - test if small enough for unwind copy
jb short CopyUnwindDown ;V - if so, then jump
std ;N - set direction flag
rep movsd ;N - move all of our dwords
cld ;N - clear direction flag
jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes
align @WordSize
LeadDown2:
mov al,[esi+3] ;U - load first byte
and edx,ecx ;V - trailing byte count
mov [edi+3],al ;U - write out first byte
mov al,[esi+2] ;V - get second byte from source
shr ecx,2 ;U - shift down to dword count
mov [edi+2],al ;V - write second byte to destination
sub esi,2 ;U - point to last src dword
sub edi,2 ;V - point to last dest dword
cmp ecx,8 ;U - test if small enough for unwind copy
jb short CopyUnwindDown ;V - if so, then jump
std ;N - set direction flag
rep movsd ;N - move all of our dwords
cld ;N - clear direction flag
jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes
align @WordSize
LeadDown3:
mov al,[esi+3] ;U - load first byte
and edx,ecx ;V - trailing byte count
mov [edi+3],al ;U - write out first byte
mov al,[esi+2] ;V - get second byte from source
mov [edi+2],al ;U - write second byte to destination
mov al,[esi+1] ;V - get third byte from source
shr ecx,2 ;U - shift down to dword count
mov [edi+1],al ;V - write third byte to destination
sub esi,3 ;U - point to last src dword
sub edi,3 ;V - point to last dest dword
cmp ecx,8 ;U - test if small enough for unwind copy
jb CopyUnwindDown ;V - if so, then jump
std ;N - set direction flag
rep movsd ;N - move all of our dwords
cld ;N - clear direction flag
jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes
;------------------------------------------------------------------
align @WordSize
UnwindDownVec dd UnwindDown7, UnwindDown6, UnwindDown5, UnwindDown4
dd UnwindDown3, UnwindDown2, UnwindDown1, UnwindDown0
UnwindDown7:
mov eax,[esi+ecx*4+28] ;U - get dword from source
;V - spare
mov [edi+ecx*4+28],eax ;U - put dword into destination
UnwindDown6:
mov eax,[esi+ecx*4+24] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4+24],eax ;U - put dword into destination
UnwindDown5:
mov eax,[esi+ecx*4+20] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4+20],eax ;U - put dword into destination
UnwindDown4:
mov eax,[esi+ecx*4+16] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4+16],eax ;U - put dword into destination
UnwindDown3:
mov eax,[esi+ecx*4+12] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4+12],eax ;U - put dword into destination
UnwindDown2:
mov eax,[esi+ecx*4+8] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4+8],eax ;U - put dword into destination
UnwindDown1:
mov eax,[esi+ecx*4+4] ;U(entry)/V(not) - get dword from source
;V(entry) - spare
mov [edi+ecx*4+4],eax ;U - put dword into destination
lea eax,[ecx*4] ;V - compute update for pointer
add esi,eax ;U - update source pointer
add edi,eax ;V - update destination pointer
UnwindDown0:
jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes
;-----------------------------------------------------------------------------
align @WordSize
TrailDownVec dd TrailDown0, TrailDown1, TrailDown2, TrailDown3
align @WordSize
TrailDown0:
mov eax,[dst] ;U - return pointer to destination
;V - spare
pop esi ;U - restore esi
pop edi ;V - restore edi
M_EXIT
align @WordSize
TrailDown1:
mov al,[esi+3] ;U - get byte from source
;V - spare
mov [edi+3],al ;U - put byte in destination
mov eax,[dst] ;V - return pointer to destination
pop esi ;U - restore esi
pop edi ;V - restore edi
M_EXIT
align @WordSize
TrailDown2:
mov al,[esi+3] ;U - get first byte from source
;V - spare
mov [edi+3],al ;U - put first byte into destination
mov al,[esi+2] ;V - get second byte from source
mov [edi+2],al ;U - put second byte into destination
mov eax,[dst] ;V - return pointer to destination
pop esi ;U - restore esi
pop edi ;V - restore edi
M_EXIT
align @WordSize
TrailDown3:
mov al,[esi+3] ;U - get first byte from source
;V - spare
mov [edi+3],al ;U - put first byte into destination
mov al,[esi+2] ;V - get second byte from source
mov [edi+2],al ;U - put second byte into destination
mov al,[esi+1] ;V - get third byte from source
mov [edi+1],al ;U - put third byte into destination
mov eax,[dst] ;V - return pointer to destination
pop esi ;U - restore esi
pop edi ;V - restore edi
M_EXIT
_MEM_ endp
end
langue 回复于:2006-12-28 18:32:58
/* $NetBSD: bcopy.c,v 1.6 1997/07/13 20:24:12 christos Exp $ */
...
/*
* Copy a block of memory, handling overlap.
* This is the routine that actually implements
* (the portable versions of) bcopy, memcpy, and memmove.
*/
#ifdef MEMCOPY
void *
memcpy(dst0, src0, length)
#else
#ifdef MEMMOVE
void *
memmove(dst0, src0, length)
#else
void
bcopy(src0, dst0, length)
#endif
#endif
void *dst0;
const void *src0;
register size_t length;
{
register char *dst = dst0;
register const char *src = src0;
register size_t t;
if (length == 0 || dst == src) /* nothing to do */
goto done;
/*
* Macros: loop-t-times; and loop-t-times, t>0
*/
#define TLOOP(s) if (t) TLOOP1(s)
#define TLOOP1(s) do { s; } while (--t)
if ((unsigned long)dst < (unsigned long)src) {
/*
* Copy forward.
*/
t = (long)src; /* only need low bits */
if ((t | (long)dst) & wmask) {
/*
* Try to align operands. This cannot be done
* unless the low bits match.
*/
if ((t ^ (long)dst) & wmask || length < wsize)
t = length;
else
t = wsize - (t & wmask);
length -= t;
TLOOP1(*dst++ = *src++);
}
/*
* Copy whole words, then mop up any trailing bytes.
*/
t = length / wsize;
TLOOP(*(word *)dst = *(word *)src; src += wsize; dst += wsize);
t = length & wmask;
TLOOP(*dst++ = *src++);
} else {
/*
* Copy backwards. Otherwise essentially the same.
* Alignment works as before, except that it takes
* (t&wmask) bytes to align, not wsize-(t&wmask).
*/
src += length;
dst += length;
t = (long)src;
if ((t | (long)dst) & wmask) {
if ((t ^ (long)dst) & wmask || length <= wsize)
t = length;
else
t &= wmask;
length -= t;
TLOOP1(*--dst = *--src);
}
t = length / wsize;
TLOOP(src -= wsize; dst -= wsize; *(word *)dst = *(word *)src);
t = length & wmask;
TLOOP(*--dst = *--src);
}
done:
#if defined(MEMCOPY) || defined(MEMMOVE)
return (dst0);
#else
return;
#endif
gvim 回复于:2006-12-28 18:37:31
引用:原帖由 langue 于 2006-12-28 18:32 发表
/* $NetBSD: bcopy.c,v 1.6 1997/07/13 20:24:12 christos Exp $ */
...
/*
* Copy a block of memory, handling overlap.
* This is the routine that actually implements
* (the por ...
NetBSD的代码注重跨平台,首先需要在他所支持的平台上都能编译,其次才是效率。
MS的代码注重效率,只需要在i386上,自然可以强力优化,比如优化UV流水线、借助SSE指令等,移植性为0
两种设计,很有意思。:D
gvim 回复于:2006-12-28 19:17:03
http://blog.codingnow.com/2005/10/vc_memcpy.html
mik 回复于:2006-12-28 22:36:20
这个算法用C写,不知编译器会怎样生成代码,倒不如直接用汇编
不考虑其它条件的话:
#define do_mem_copy(dest, src, size) \
__asm__ __volatile__ ("movl %0, %%edi; movl %1, %%esi; \
movl %2, %%ecx; movl %%ecx, %%eax; \
shr $2, %%ecx; rep movsl; \
movl %%eax, %%ecx; andl $3, %%ecx; \
rep movsb" \
: \
: "r" (dest), "r" (src), "r" (size) \
);
再看看 sun 在 Solaris 的实现:
ENTRY(memcpy)
movl %edi,%edx / save register variables
pushl %esi
movl 8(%esp),%edi / %edi = dest address
movl 12(%esp),%esi / %esi = source address
movl 16(%esp),%ecx / %ecx = length of string
movl %edi,%eax / return value from the call
shrl $2,%ecx / %ecx = number of words to move
rep ; smovl / move the words
movl 16(%esp),%ecx / %ecx = number of bytes to move
andl $0x3,%ecx / %ecx = number of bytes left to move
rep ; smovb / move the bytes
popl %esi / restore register variables
movl %edx,%edi
ret
SET_SIZE(memcpy)
nully 回复于:2006-12-28 23:26:17
引用:原帖由 gvim 于 2006-12-28 18:37 发表
NetBSD的代码注重跨平台,首先需要在他所支持的平台上都能编译,其次才是效率。
MS的代码注重效率,只需要在i386上,自然可以强力优化,比如优化UV流水线、借助SSE指令等,移植性为0
两种设计,很有意思。:D
确实是有趣的问题。
不过NetBSD也可以这么做,首先调查在哪些平台上用得比较多,然后针对这些平台上:
#ifdef _I686_
...
使用UV流水线
#ifdef _MMX_
使用mmx...
#ifdef _SSE_
使用sse...
...
不过代码就太不美观了。。。不可取,呵呵
gvim 回复于:2006-12-28 23:36:35
引用:原帖由 nully 于 2006-12-28 23:26 发表
确实是有趣的问题。
不过NetBSD也可以这么做,首先调查在哪些平台上用得比较多,然后针对这些平台上:
#ifdef _I686_
...
使用UV流水线
#ifdef _MMX_
使用mmx...
#ifdef _SSE_
使用sse...
...
...
美观不是主要因素,虽然人爱美之心人皆有之
NetBSD老婆太多,需要照顾的太多,要公平 :)
现在的做法,1套代码可以用到N个机器上。也就是1:N。
如果为每个平台都进行优化,那就需要N套代码,就是N:N。
如果在每个平台上作不同优化,比如i386, i686, uv, sse,那么就是M:N (M>N)
而memcpy这样的操作语义是操作系统提供的,而不是底层平台,因此封装到平台相关层面也不合适。同时,对于一个以跨平台为设计目标的系统来说,这简直就是灾难。
韦小宝对付老婆也只用了一招:耍赖 :mrgreen:
MS就不一样,只有一个老婆,就可以专人专项了。
[ 本帖最后由 gvim 于 2006-12-28 23:38 编辑 ]
nully 回复于:2006-12-28 23:41:03
引用:原帖由 gvim 于 2006-12-28 23:36 发表
美观不是主要因素,虽然人爱美之心人皆有之
NetBSD老婆太多,需要照顾的太多,要公平 :)
现在的做法,1套代码可以用到N个机器上。也就是1:N。
如果为每个平台都进行优化,那就需要N套代码,就是N:N。
...
偶这刚说了嘛,只对睡得比较多的大奶进行优化。。。二三四奶的先不急。。。
至于公不公平的问题,各个老婆长相不一样,三围也不一样,老公要公平,从何谈起。。。。
gvim 回复于:2006-12-28 23:43:29
引用:原帖由 nully 于 2006-12-28 23:41 发表
偶这刚说了嘛,只对睡得比较多的大奶进行优化。。。二三四奶的先不急。。。
至于公不公平的问题,各个老婆长相不一样,三围也不一样,老公要公平,从何谈起。。。。
大奶:em06::em06::em06: 我喜欢高树玛丽亚,长得不错,可惜不大:em10: 跑题了,呵呵,打住 :em11:
namtso 回复于:2006-12-28 23:46:24
很多书上都说,像memcpy这种通用函数,对应平台上的库“有可能提供优化”代码。
只有在做嵌入式等特殊程序的时候,由于在对应平台上库资源比较缺乏,“没有提供优化代码”的可能性非常大,所以才需要自己去做优化。
nully 回复于:2006-12-28 23:56:29
程序要保持苗条,切记切记。。。
dulao5 回复于:2006-12-29 09:05:32
呵呵,这回真是抛砖引玉了, 引出的玉非常多,我仔细学习下.
星尘细雨 回复于:2006-12-29 10:18:58
使用了mmx/sse/sse2的汇编来写memcpy肯定是更快的,
就是在非x86的CPU移植麻烦,
还有我记得性能还和cpu有关,
Intel的cpu在执行sse2指令比amd的cpu的sse2指令快。
cjaizss 回复于:2006-12-29 10:34:04
CPU一般都专门对字符串处理做了支持,使得比你这样写效率要高。你用C语言写怎么可能快的过纯C语言所无法表示的汇编语句所表达的意思?
高峰 回复于:2006-12-29 10:50:42
崇拜汇编的牛人!
cjaizss 回复于:2006-12-29 11:00:40
引用:原帖由 高峰 于 2006-12-29 10:50 发表
崇拜汇编的牛人!
汇编,只是一个工具,崇拜是因为你碰的少,碰多了也就那么回事
思一克 回复于:2006-12-29 11:03:07
INTEL 上的REPZ MOVSB
快。memcpy等就可能优化成了这些。
用C写的慢。4个字节一起做也快不了多少
dulao5 回复于:2006-12-29 11:04:58
嗯,看起来的确是这样, MOVSB指令就是为了移动串而生的,用它的确应该快.
mik 回复于:2006-12-30 00:12:24
引用:原帖由 星尘细雨 于 2006-12-29 10:18 发表
使用了mmx/sse/sse2的汇编来写memcpy肯定是更快的,
就是在非x86的CPU移植麻烦,
还有我记得性能还和cpu有关,
Intel的cpu在执行sse2指令比amd的cpu的sse2指令快。
既然提到 MMX 指令,就贴一段使用MMX指令copy内存块的代码:
// block copy: copy a number of DWORDs from DWORD aligned source
// to DWORD aligned destination using cacheable stores.
__asm {
MOV ESI, [src_ptr] ;pointer to src, DWORD aligned
MOV EDI, [dst_ptr] ;pointer to dst, DWORD aligned
MOV ECX, [blk_size] ;number of DWORDs to copy
PREFETCH (ESI) ;PREFetch first src cache line
CMP ECX, 1 ;less than one DWORD to copy ?
JB $copydone2_cc ;yes, must be no DWORDs to copy, done
TEST EDI, 7 ;dst QWORD aligned?
JZ $dstqaligned2_cc ;yes
MOVD MM0, [ESI] ;read one DWORD from src
MOVD [EDI], MM0 ;store one DWORD to dst
ADD ESI, 4 ;src++
ADD EDI, 4 ;dst++
DEC ECX ;number of DWORDs to copy
$dstqaligned2_cc:
MOV EBX, ECX ;number of DWORDs to copy
SHR ECX, 4 ;number of cache lines to copy
JZ $copyqwords2_cc ;no whole cache lines to copy, maybe QWORDs
PREFetchm (ESI,64) ;PREFetch src cache line one ahead
PREFetchmlong (ESI,128) ;PREFetch src cache line two ahead
ALIGN 16 ;align loop for optimal performance
$cloop2_cc:
PREFetchmlong (ESI, 192) ;prefetch cache line three ahead
MOVQ MM0, [ESI] ;load first QWORD in cache line from src
ADD EDI, 64 ;src++
MOVQ MM1, [ESI+8] ;load second QWORD in cache line from src
ADD ESI, 64 ;dst++
MOVQ MM2, [ESI-48] ;load third QWORD in cache line from src
MOVQ [EDI-64], MM0 ;store first DWORD in cache line to dst
MOVQ MM0, [ESI-40] ;load fourth QWORD in cache line from src
MOVQ [EDI-56], MM1 ;store second DWORD in cache line to dst
MOVQ MM1, [ESI-32] ;load fifth QWORD in cache line from src
MOVQ [EDI-48], MM2 ;store third DWORD in cache line to dst
MOVQ MM2, [ESI-24] ;load sixth QWORD in cache line from src
MOVQ [EDI-40], MM0 ;store fourth DWORD in cache line to dst
MOVQ MM0, [ESI-16] ;load seventh QWORD in cache line from src
MOVQ [EDI-32], MM1 ;store fifth DWORD in cache line to dst
MOVQ MM1, [ESI-8] ;load eight QWORD in cache line from src
MOVQ [EDI-24], MM2 ;store sixth DWORD in cache line to dst
MOVQ [EDI-16], MM0 ;store seventh DWORD in cache line to dst
DEC ECX ;count--
MOVQ [EDI-8], MM1 ;store eighth DWORD in cache line to dst
JNZ $cloop2_cc ;until no more cache lines to copy
$copyqwords2_cc:
MOV ECX, EBX ;number of DWORDs to copy
AND EBX, 0xE ;number of QWORDS left to copy * 2
JZ $copydword2_cc ;no QWORDs left, maybe DWORD left
ALIGN 16 ;align loop for optimal performance
$qloop2_cc:
MOVQ MM0, [ESI] ;read QWORD from src
MOVQ [EDI], MM0 ;store QWORD to dst
ADD ESI, 8 ;src++
ADD EDI, 8 ;dst++
SUB EBX, 2 ;count--
JNZ $qloop2_cc ;until no more QWORDs left to copy
$copydword2_cc:
TEST ECX, 1 ;DWORD left to copy ?
JZ $copydone2_cc ;nope, we’re done
MOVD MM0, [ESI] ;read last DWORD from src
MOVD [EDI], MM0 ;store last DWORD to dst
$copydone2_cc:
FEMMS ;clear MMX state
}
当复制块大于 512 字节时,使用以上代码才有价值
另一个更强悍的手法是使用: 128 位的 xmm 指令
movdqa xmm0, [rdx+r8*8] ; Load
movntdq [rcx+r8*8], xmm0 ; Store
movdqa xmm1, [rdx+r8*8+16] ; Load
movntdq [rcx+r8*8+16], xmm1 ; Store
mingyanguo 回复于:2006-12-30 08:57:56
这种效率很诡异,我认为这些优化中,真值得去做的,就是把byte复制转化成word复制,这是跨平台的方法,而且绝大多数时候都能获得大幅度的效率提升。有时候,asm未必就比C快。
我测试的结果是,哪怕是asm优化过的,在拷贝的大小不同时效率也是不同的,很难说哪个更好。
另外,ia32上的串操作指令未必快,而mmx/sse这些指令又不通用。
代码上的优化还是适可而止的好,过度优化的代码没有生命力。
cobras 回复于:2006-12-30 20:29:03
不知使用汇编代码如何?
mov di,dest
mov si,src
mov cx,size
cld
rep movsd
不考虑重叠情况
obrire 回复于:2006-12-30 20:59:53
以前有Intel提供了相关优化的讲义,其间是讲到了C有些地方的写法,有很大的讲究
至于WimmX优化指令等,还是做浮点,积分很牛的哟。内存拷贝对系统而言,是可
以优化的,优化可以针对块拷贝,如512对齐等,具体要看应用了,有些需要大块对
齐,有些是线性的,如果系统允许,地接内存重映射就可以了。具体实现,还与OS的
内存管理有关。
如需大块线性空间,但用户空间已经不可能申请太多时,这时内存管理算法比拷贝可
能更耗时间了。当然,最快也无非DMA,锁存,传输,解锁,且最后换算出来,一定
是线性最好。你想想,如果真要while什么的,一定会慢下来。以后计算机最好用硬
件设计典型的加速器,且有多个适应逻辑,支持4K/BYTE/4M等单位数据迁移。
如果你需要4MB,直接在最短的1个周期中映射过去,力争在10个周期内处理完数据
(POST)。
唉,这些应用与实际要求相结合最好。至于memcpy,一般而言,就用libc的就可以了。
要求同步数据或其它高级应用,当然直接是你管理内存最好。不是以后有高带内存与
NOR一样了吗?可读,可写,可保存,设计一块区域专用于内存拷贝,还可异步完成。
双核心,系统也这样设计,一定是很高效了。
SEND DATA -> EXT LOCK
SEND MSG -> DATA READY ->READ EXT LOCK DATA->NEXT OP
NEXT OP
扯得太远了。
|