[Mono-devel-list] [Patch] Manged code is fast!

Fri May 21 08:29:04 EDT 2004

On 05/21/04 Andreas Nahr wrote:
> > > private unsafe static void CharCopy (char* source, char* destination,
> int count)
> >
> > What is the perf here if things are not dword aligned?
> 
> I think for me thing always were dword aligned. We should ensure that
> Strings always get the right alignment in the JIT.

We can guarantee the character data in a string will be aligned to a 4 byte
boundary, but CharCopy can called on data aligned to just 2.

> > > + while (count >= 16) {
> > > + *((int*) destination) = *((int*) source);
> > > + destination += 2;
> > > + source += 2;
> > > + *((int*) destination) = *((int*) source);
> > > + destination += 2;
> > > + source += 2;
> >
> > It is probably better to do something like:
> >
> > *((int*) dest + x) = ...
> 
> Did you really test this or are you just guessing?

What? It's much easier to talk than to test! Why should he test? :-)

> For me the above solution (although more source code) always produced
> superior speed.
> However I used the notation *((int*) dest[x]) =...
> But this seems to be compiled into same IL.

When you posted about the low performance and I changed the JIT to
produce faster code I also investigated a few methods in String and
methods to do copies. The basic thing to note is to keep the variables
used in the inner loop to 3 and to do clever unrolling. When unrolling
in a copy, for example you should not do:
	copy 1
	increase pointers by 1
	copy 1
	increase pointers by 1
	...

but the more efficient:
	copy 1
	copy 1
	copy 1
	copy 1
	increase pointers by 4

See the attached benchmarks for ideas: GetHashCode() is always faster
than the C version (on x86, on ppc it's faster until 200 chars and 20%
slower at 1000, but I didn't optimize that yet). It's twice as fast
as the current code so I'll get it in cvs in the next few days.
As for copies: I'd like to have something like the attached memcpy in
System.String and use it whenever a copy is required (it will eventually
be used also for the cpblk IL opcode). The memcpy is always faster than
the C version for me (except when the data is misaligned): I didn't have
the time to properly test if this is because of bugs in the code:-)
If someone would write a set of extensive tests for memcpy it'll be
appreciated.
Results from both benchmarks on different cpus are also appreciated:
please provide cpu type and speed and run with -O=all with mono from
cvs (-O=loop is enough to get most of the speed: I'll enable it by
default shortly since it has low impact on JIT time).
A memmove method is also needed for some of the string methods.
Thanks.

lupus

-- 
-----------------------------------------------------------------
lupus at debian.org                                     debian/rules
lupus at ximian.com                             Monkeys do it better
-------------- next part --------------
using System;

class T {

	static unsafe int Hash (string s) {
		//if (s.Length >= 1000)
		//	return s.GetHashCode ();
		fixed (char * c = s) {
			char * cc = c;
			char * end = cc + s.Length - 1;
			int h = 0;
			for (;cc < end; cc += 2) {
				h = (h << 5) - h + *cc;
				h = (h << 5) - h + cc [1];
			}
			++end;
			if (cc < end)
				h = (h << 5) - h + *cc;
			return h;
		}
	}

	static unsafe int Hash2 (string s) {
		//if (s.Length >= 1000)
		//	return s.GetHashCode ();
		fixed (char * c = s) {
			char * cc = c;
			char * end = cc + s.Length - 4;
			int h = 0;
			for (;cc < end; cc += 4) {
				h = (h << 5) - h + *cc;
				h = (h << 5) - h + cc [1];
				h = (h << 5) - h + cc [2];
				h = (h << 5) - h + cc [3];
			}
			end += 4;
			while (cc < end)
				h = (h << 5) - h + *cc++;

			return h;
		}
	}

	const int count = 1000000;
	static int test_new (int size, bool print) {
		string s = new String ('a', size);
		int start, end, i, v = 0;
		start = Environment.TickCount;
		for (i = 0; i < count; ++i) {
			v = Hash2 (s);
		}
		end = Environment.TickCount;
		if (print)
			Console.Write ("{0}\t{1}\t", size, end-start);
		return v;
	}

	static int test_old (int size, bool print) {
		string s = new String ('a', size);
		int start, end, i, v = 0;
		start = Environment.TickCount;
		for (i = 0; i < count; ++i) {
			v = s.GetHashCode ();
		}
		end = Environment.TickCount;
		if (print)
			Console.WriteLine (end-start);
		return v;
	}

	static void test (int size) {
		int v1 = test_new (size, true);
		int v2 = test_old (size, true);
		if (v1 != v2)
			Console.WriteLine ("failed compare");
	}
	static void Main () {
		int size;
		for (size = 0; size < 500; ) {
			test (size);
			if (size < 5)
				size += 1;
			else if (size < 50)
				size += 5;
			else if (size < 150)
				size += 11;
			else
				size += 21;
		}
		// degenerate cases: we should always cache the hash in these cases
		test (1000);
		test (10000);
	}
}

-------------- next part --------------
using System;

class T {
	static unsafe void memcpy4 (byte *dest, byte *src, int size) {
		while (size >= 32) {
			// using long is better than int and slower than double
			// FIXME: enable this only on correct alignment or on platforms
			// that can tolerate unaligned reads/writes of doubles
			((double*)dest) [0] = ((double*)src) [0];
			((double*)dest) [1] = ((double*)src) [1];
			((double*)dest) [2] = ((double*)src) [2];
			((double*)dest) [3] = ((double*)src) [3];
			dest += 32;
			src += 32;
			size -= 32;
		}
		while (size >= 16) {
			((int*)dest) [0] = ((int*)src) [0];
			((int*)dest) [1] = ((int*)src) [1];
			((int*)dest) [2] = ((int*)src) [2];
			((int*)dest) [3] = ((int*)src) [3];
			dest += 16;
			src += 16;
			size -= 16;
		}
		while (size >= 4) {
			((int*)dest) [0] = ((int*)src) [0];
			dest += 4;
			src += 4;
			size -= 4;
		}
		while (size > 0) {
			((byte*)dest) [0] = ((byte*)src) [0];
			dest += 1;
			src += 1;
			--size;
		}
	}
	static unsafe void memcpy2 (byte *dest, byte *src, int size) {
		while (size >= 8) {
			((short*)dest) [0] = ((short*)src) [0];
			((short*)dest) [1] = ((short*)src) [1];
			((short*)dest) [2] = ((short*)src) [2];
			((short*)dest) [3] = ((short*)src) [3];
			dest += 8;
			src += 8;
			size -= 8;
		}
		while (size >= 2) {
			((short*)dest) [0] = ((short*)src) [0];
			dest += 2;
			src += 2;
			size -= 2;
		}
		if (size > 0)
			((byte*)dest) [0] = ((byte*)src) [0];
	}
	static unsafe void memcpy1 (byte *dest, byte *src, int size) {
		while (size >= 8) {
			((byte*)dest) [0] = ((byte*)src) [0];
			((byte*)dest) [1] = ((byte*)src) [1];
			((byte*)dest) [2] = ((byte*)src) [2];
			((byte*)dest) [3] = ((byte*)src) [3];
			((byte*)dest) [4] = ((byte*)src) [4];
			((byte*)dest) [5] = ((byte*)src) [5];
			((byte*)dest) [6] = ((byte*)src) [6];
			((byte*)dest) [7] = ((byte*)src) [7];
			dest += 8;
			src += 8;
			size -= 8;
		}
		while (size >= 2) {
			((byte*)dest) [0] = ((byte*)src) [0];
			((byte*)dest) [1] = ((byte*)src) [1];
			dest += 2;
			src += 2;
			size -= 2;
		}
		if (size > 0)
			((byte*)dest) [0] = ((byte*)src) [0];
	}
	static unsafe void memcpy (byte *dest, byte *src, int size) {
		// FIXME: if pointers are not aligned, try to align them
		// so a faster routine can be used. Handle the case where
		// the pointers can't be reduced to have the same alignment
		// (just ignore the issue on x86?)
		if (((int)dest & 3) == 0 && ((int)src & 3) == 0) {
			memcpy4 (dest, src, size);
		} else if (((int)dest & 1) == 0 && ((int)src & 1) == 0) {
			if (size >= 6) {
				dest [0] = src [0];
				dest [1] = src [1];
				memcpy4 (dest + 2, src + 2, size - 2);
			} else {
				memcpy2 (dest, src, size);
			}
		} else {
			if (size >= 6) {
				dest [0] = src [0];
				++dest;
				++src;
				--size;
				if (((int)dest & 3) == 0 && ((int)src & 3) == 0) {
					memcpy4 (dest, src, size);
				} else if (((int)dest & 1) == 0 && ((int)src & 1) == 0) {
					if (size >= 6) {
						dest [0] = src [0];
						dest [1] = src [1];
						memcpy4 (dest + 2, src + 2, size - 2);
					} else {
						memcpy2 (dest, src, size);
					}
				} else {
					memcpy1 (dest, src, size);
				}
			} else {
				memcpy1 (dest, src, size);
			}
		}
	}
	static unsafe void BlockCopy (Array src, int srcOffset, Array dest, int destOffset, int count) {
		if (srcOffset < 0)
			throw new ArgumentOutOfRangeException ("srcOffset", "non-neg");
		if (destOffset < 0)
			throw new ArgumentOutOfRangeException ("destOffset", "non-neg");
		if (count < 0)
			throw new ArgumentOutOfRangeException ("count", "non-neg");
		// must check otherwise the fixed expression throws out-of-range exception
		if (count == 0)
			return;
		// need icall to simulate this and use ByteLength instead of Length
		byte[] s = (byte[])src;
		byte[] d = (byte[])dest;
		if (srcOffset + count > s.Length || destOffset + count > d.Length)
			throw new ArgumentException ("out of bounds");
		fixed (byte *A = s, B = d) {
			memcpy (B + destOffset, A + srcOffset, count);
		}
	}
	const int count = 2000000;
	static void run (int sz, int ofs, byte[] src, byte[] dst) {
		int start, end;

		start = Environment.TickCount;
		for (int i = 0; i < count; i++) {
			System.Buffer.BlockCopy (src, ofs, dst, ofs, sz - ofs);
		}
		end = Environment.TickCount;
		Console.Write ("{0}: {1} ", sz, end-start);
	}
	static unsafe void run1 (int sz, int ofs, byte[] src, byte[] dst) {
		int start, end;

		start = Environment.TickCount;
		for (int i = 0; i < count; i++) {
			BlockCopy (src, ofs, dst, ofs, sz - ofs);
		}
		end = Environment.TickCount;
		Console.Write (" {0} ", end-start);
	}
	static void run2 (int sz, int ofs, byte[] src, byte[] dst) {
		int start, end;

		start = Environment.TickCount;
		for (int i = 0; i < count; i++) {
			System.Array.Copy (src, ofs, dst, ofs, sz - ofs);
		}
		end = Environment.TickCount;
		Console.WriteLine (end-start);
	}
	static void Main (string[] args) {
		int i, ofs;
		i = Environment.TickCount;
		byte [] src;
		byte [] dst;
		ofs = 0;
		if (args.Length > 0)
			ofs = int.Parse (args [0]);
		Console.WriteLine ("Size: Buffer.bcopy memcpy Array.Copy (align: {0})", ofs);
		for (i = 1; i <= 10000;) {
			src = new byte [i];
			dst = new byte [i];
			if (i >= ofs) {
				run (i, ofs, src, dst);
				run1 (i, ofs, src, dst);
				run2 (i, ofs, src, dst);
			}
			if (i < 5)
				++i;
			else if (i < 50)
				i += 5;
			else if (i < 100)
				i += 10;
			else if (i < 1000)
				i += 100;
			else
				i += 1000;
		}
		src = new byte [100000];
		dst = new byte [100000];
		run (100000, ofs, src, dst);
		run1 (100000, ofs, src, dst);
		run2 (100000, ofs, src, dst);
	}
}