[Mono-devel-list] [Patch] Manged code is fast!
Paolo Molaro
lupus at ximian.com
Fri May 21 08:29:04 EDT 2004
On 05/21/04 Andreas Nahr wrote:
> > > private unsafe static void CharCopy (char* source, char* destination,
> int count)
> >
> > What is the perf here if things are not dword aligned?
>
> I think for me thing always were dword aligned. We should ensure that
> Strings always get the right alignment in the JIT.
We can guarantee the character data in a string will be aligned to a 4 byte
boundary, but CharCopy can called on data aligned to just 2.
> > > + while (count >= 16) {
> > > + *((int*) destination) = *((int*) source);
> > > + destination += 2;
> > > + source += 2;
> > > + *((int*) destination) = *((int*) source);
> > > + destination += 2;
> > > + source += 2;
> >
> > It is probably better to do something like:
> >
> > *((int*) dest + x) = ...
>
> Did you really test this or are you just guessing?
What? It's much easier to talk than to test! Why should he test? :-)
> For me the above solution (although more source code) always produced
> superior speed.
> However I used the notation *((int*) dest[x]) =...
> But this seems to be compiled into same IL.
When you posted about the low performance and I changed the JIT to
produce faster code I also investigated a few methods in String and
methods to do copies. The basic thing to note is to keep the variables
used in the inner loop to 3 and to do clever unrolling. When unrolling
in a copy, for example you should not do:
copy 1
increase pointers by 1
copy 1
increase pointers by 1
...
but the more efficient:
copy 1
copy 1
copy 1
copy 1
increase pointers by 4
See the attached benchmarks for ideas: GetHashCode() is always faster
than the C version (on x86, on ppc it's faster until 200 chars and 20%
slower at 1000, but I didn't optimize that yet). It's twice as fast
as the current code so I'll get it in cvs in the next few days.
As for copies: I'd like to have something like the attached memcpy in
System.String and use it whenever a copy is required (it will eventually
be used also for the cpblk IL opcode). The memcpy is always faster than
the C version for me (except when the data is misaligned): I didn't have
the time to properly test if this is because of bugs in the code:-)
If someone would write a set of extensive tests for memcpy it'll be
appreciated.
Results from both benchmarks on different cpus are also appreciated:
please provide cpu type and speed and run with -O=all with mono from
cvs (-O=loop is enough to get most of the speed: I'll enable it by
default shortly since it has low impact on JIT time).
A memmove method is also needed for some of the string methods.
Thanks.
lupus
--
-----------------------------------------------------------------
lupus at debian.org debian/rules
lupus at ximian.com Monkeys do it better
-------------- next part --------------
using System;
class T {
static unsafe int Hash (string s) {
//if (s.Length >= 1000)
// return s.GetHashCode ();
fixed (char * c = s) {
char * cc = c;
char * end = cc + s.Length - 1;
int h = 0;
for (;cc < end; cc += 2) {
h = (h << 5) - h + *cc;
h = (h << 5) - h + cc [1];
}
++end;
if (cc < end)
h = (h << 5) - h + *cc;
return h;
}
}
static unsafe int Hash2 (string s) {
//if (s.Length >= 1000)
// return s.GetHashCode ();
fixed (char * c = s) {
char * cc = c;
char * end = cc + s.Length - 4;
int h = 0;
for (;cc < end; cc += 4) {
h = (h << 5) - h + *cc;
h = (h << 5) - h + cc [1];
h = (h << 5) - h + cc [2];
h = (h << 5) - h + cc [3];
}
end += 4;
while (cc < end)
h = (h << 5) - h + *cc++;
return h;
}
}
const int count = 1000000;
static int test_new (int size, bool print) {
string s = new String ('a', size);
int start, end, i, v = 0;
start = Environment.TickCount;
for (i = 0; i < count; ++i) {
v = Hash2 (s);
}
end = Environment.TickCount;
if (print)
Console.Write ("{0}\t{1}\t", size, end-start);
return v;
}
static int test_old (int size, bool print) {
string s = new String ('a', size);
int start, end, i, v = 0;
start = Environment.TickCount;
for (i = 0; i < count; ++i) {
v = s.GetHashCode ();
}
end = Environment.TickCount;
if (print)
Console.WriteLine (end-start);
return v;
}
static void test (int size) {
int v1 = test_new (size, true);
int v2 = test_old (size, true);
if (v1 != v2)
Console.WriteLine ("failed compare");
}
static void Main () {
int size;
for (size = 0; size < 500; ) {
test (size);
if (size < 5)
size += 1;
else if (size < 50)
size += 5;
else if (size < 150)
size += 11;
else
size += 21;
}
// degenerate cases: we should always cache the hash in these cases
test (1000);
test (10000);
}
}
-------------- next part --------------
using System;
class T {
static unsafe void memcpy4 (byte *dest, byte *src, int size) {
while (size >= 32) {
// using long is better than int and slower than double
// FIXME: enable this only on correct alignment or on platforms
// that can tolerate unaligned reads/writes of doubles
((double*)dest) [0] = ((double*)src) [0];
((double*)dest) [1] = ((double*)src) [1];
((double*)dest) [2] = ((double*)src) [2];
((double*)dest) [3] = ((double*)src) [3];
dest += 32;
src += 32;
size -= 32;
}
while (size >= 16) {
((int*)dest) [0] = ((int*)src) [0];
((int*)dest) [1] = ((int*)src) [1];
((int*)dest) [2] = ((int*)src) [2];
((int*)dest) [3] = ((int*)src) [3];
dest += 16;
src += 16;
size -= 16;
}
while (size >= 4) {
((int*)dest) [0] = ((int*)src) [0];
dest += 4;
src += 4;
size -= 4;
}
while (size > 0) {
((byte*)dest) [0] = ((byte*)src) [0];
dest += 1;
src += 1;
--size;
}
}
static unsafe void memcpy2 (byte *dest, byte *src, int size) {
while (size >= 8) {
((short*)dest) [0] = ((short*)src) [0];
((short*)dest) [1] = ((short*)src) [1];
((short*)dest) [2] = ((short*)src) [2];
((short*)dest) [3] = ((short*)src) [3];
dest += 8;
src += 8;
size -= 8;
}
while (size >= 2) {
((short*)dest) [0] = ((short*)src) [0];
dest += 2;
src += 2;
size -= 2;
}
if (size > 0)
((byte*)dest) [0] = ((byte*)src) [0];
}
static unsafe void memcpy1 (byte *dest, byte *src, int size) {
while (size >= 8) {
((byte*)dest) [0] = ((byte*)src) [0];
((byte*)dest) [1] = ((byte*)src) [1];
((byte*)dest) [2] = ((byte*)src) [2];
((byte*)dest) [3] = ((byte*)src) [3];
((byte*)dest) [4] = ((byte*)src) [4];
((byte*)dest) [5] = ((byte*)src) [5];
((byte*)dest) [6] = ((byte*)src) [6];
((byte*)dest) [7] = ((byte*)src) [7];
dest += 8;
src += 8;
size -= 8;
}
while (size >= 2) {
((byte*)dest) [0] = ((byte*)src) [0];
((byte*)dest) [1] = ((byte*)src) [1];
dest += 2;
src += 2;
size -= 2;
}
if (size > 0)
((byte*)dest) [0] = ((byte*)src) [0];
}
static unsafe void memcpy (byte *dest, byte *src, int size) {
// FIXME: if pointers are not aligned, try to align them
// so a faster routine can be used. Handle the case where
// the pointers can't be reduced to have the same alignment
// (just ignore the issue on x86?)
if (((int)dest & 3) == 0 && ((int)src & 3) == 0) {
memcpy4 (dest, src, size);
} else if (((int)dest & 1) == 0 && ((int)src & 1) == 0) {
if (size >= 6) {
dest [0] = src [0];
dest [1] = src [1];
memcpy4 (dest + 2, src + 2, size - 2);
} else {
memcpy2 (dest, src, size);
}
} else {
if (size >= 6) {
dest [0] = src [0];
++dest;
++src;
--size;
if (((int)dest & 3) == 0 && ((int)src & 3) == 0) {
memcpy4 (dest, src, size);
} else if (((int)dest & 1) == 0 && ((int)src & 1) == 0) {
if (size >= 6) {
dest [0] = src [0];
dest [1] = src [1];
memcpy4 (dest + 2, src + 2, size - 2);
} else {
memcpy2 (dest, src, size);
}
} else {
memcpy1 (dest, src, size);
}
} else {
memcpy1 (dest, src, size);
}
}
}
static unsafe void BlockCopy (Array src, int srcOffset, Array dest, int destOffset, int count) {
if (srcOffset < 0)
throw new ArgumentOutOfRangeException ("srcOffset", "non-neg");
if (destOffset < 0)
throw new ArgumentOutOfRangeException ("destOffset", "non-neg");
if (count < 0)
throw new ArgumentOutOfRangeException ("count", "non-neg");
// must check otherwise the fixed expression throws out-of-range exception
if (count == 0)
return;
// need icall to simulate this and use ByteLength instead of Length
byte[] s = (byte[])src;
byte[] d = (byte[])dest;
if (srcOffset + count > s.Length || destOffset + count > d.Length)
throw new ArgumentException ("out of bounds");
fixed (byte *A = s, B = d) {
memcpy (B + destOffset, A + srcOffset, count);
}
}
const int count = 2000000;
static void run (int sz, int ofs, byte[] src, byte[] dst) {
int start, end;
start = Environment.TickCount;
for (int i = 0; i < count; i++) {
System.Buffer.BlockCopy (src, ofs, dst, ofs, sz - ofs);
}
end = Environment.TickCount;
Console.Write ("{0}: {1} ", sz, end-start);
}
static unsafe void run1 (int sz, int ofs, byte[] src, byte[] dst) {
int start, end;
start = Environment.TickCount;
for (int i = 0; i < count; i++) {
BlockCopy (src, ofs, dst, ofs, sz - ofs);
}
end = Environment.TickCount;
Console.Write (" {0} ", end-start);
}
static void run2 (int sz, int ofs, byte[] src, byte[] dst) {
int start, end;
start = Environment.TickCount;
for (int i = 0; i < count; i++) {
System.Array.Copy (src, ofs, dst, ofs, sz - ofs);
}
end = Environment.TickCount;
Console.WriteLine (end-start);
}
static void Main (string[] args) {
int i, ofs;
i = Environment.TickCount;
byte [] src;
byte [] dst;
ofs = 0;
if (args.Length > 0)
ofs = int.Parse (args [0]);
Console.WriteLine ("Size: Buffer.bcopy memcpy Array.Copy (align: {0})", ofs);
for (i = 1; i <= 10000;) {
src = new byte [i];
dst = new byte [i];
if (i >= ofs) {
run (i, ofs, src, dst);
run1 (i, ofs, src, dst);
run2 (i, ofs, src, dst);
}
if (i < 5)
++i;
else if (i < 50)
i += 5;
else if (i < 100)
i += 10;
else if (i < 1000)
i += 100;
else
i += 1000;
}
src = new byte [100000];
dst = new byte [100000];
run (100000, ofs, src, dst);
run1 (100000, ofs, src, dst);
run2 (100000, ofs, src, dst);
}
}
More information about the Mono-devel-list
mailing list