diff --git a/vsfilter/subtitles/Rasterizer.cpp b/vsfilter/subtitles/Rasterizer.cpp
index 5156436290c13656ca23c411925d1c8e497b988d..6212d645b8e287101d654dc88ff9f7c0532f978a 100644
--- a/vsfilter/subtitles/Rasterizer.cpp
+++ b/vsfilter/subtitles/Rasterizer.cpp
@@ -773,8 +773,21 @@ bool Rasterizer::Rasterize(int xsub, int ysub, bool fBlur)
 
 static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
 {
-	int a = (((alpha)*(color>>24))>>12)&0xff;
+	int a = (((alpha)*(color>>24))>>6)&0xff;	// alpha scaled by the top byte of color, shifted down by 6 and masked to 0..255
+	// Bias the weights so that both a and ia end up in the range 1..256 (ia = 256-a, then a += 1); the >>8 operations below depend on this
 	int ia = 256-a;
+	a+=1;	// a was 0..255 after the &0xff above; it is now 1..256, so a + ia == 257
+
+	*dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)	// bytes 0 and 2 of the pixel, blended with a single multiply each for dst and color
+			| ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)	// byte 1 of the pixel
+			| ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);	// byte 3 of dst is only scaled by ia; color contributes nothing to it
+}
+
+static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
+{
+	int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
+	int ia = 256-a;
+	a+=1;
 
 	*dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
 			| ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
@@ -786,11 +799,30 @@ static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
 
 static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
 {
-	alpha = ((alpha * (color>>24)) >> 12) & 0xff;
+	alpha = (((alpha) * (color>>24)) >> 6) & 0xff;	// same weight as the scalar pixmix: alpha scaled by the top byte of color, masked to 0..255
 	color &= 0xffffff;
 
 	__m128i zero = _mm_setzero_si128();
-	__m128i a = _mm_set1_epi32((alpha << 16) | (0x100 - alpha));
+	__m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));	// every 32-bit lane: low 16 bits = 256-alpha (weight for dst), high 16 bits = alpha+1 (weight for color)
+	__m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);	// bytes of *dst zero-extended to 16-bit lanes
+	__m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);	// bytes of color zero-extended to 16-bit lanes (its top byte was cleared above)
+	__m128i r = _mm_unpacklo_epi16(d, s);	// interleave so each 32-bit lane holds one (dst, color) pair of 16-bit values
+
+	r = _mm_madd_epi16(r, a);	// per 32-bit lane: dst*(256-alpha) + color*(alpha+1)
+	r = _mm_srli_epi32(r, 8);	// divide each sum by 256
+	r = _mm_packs_epi32(r, r);	// narrow 32-bit lanes to 16-bit with signed saturation
+	r = _mm_packus_epi16(r, r);	// narrow 16-bit lanes to 8-bit with unsigned saturation
+
+	*dst = (DWORD)_mm_cvtsi128_si32(r);	// the low 32 bits of r are the blended pixel
+}
+
+static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
+{
+	int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
+	color &= 0xffffff;
+
+	__m128i zero = _mm_setzero_si128();
+	__m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
 	__m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
 	__m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
 	__m128i r = _mm_unpacklo_epi16(d, s);
@@ -813,7 +845,8 @@ static const __int64 _00ff00ff00ff00ff = 0x00ff00ff00ff00ffi64;
 // clipRect is a rectangular clip region to render inside.
 // pAlphaMask is an alpha clipping mask.
 // xsub and ysub ???
-// switchpts seems to be an array of interlaced colour switching coordinates/colours to switch to.
+// switchpts seems to be an array of fill colours interleaved with coordinates:
+//    switchpts[i*2] is a colour and switchpts[i*2+1] is the x coordinate from which that colour is used.
 // fBody tells whether to render the body of the subs.
 // fBorder tells whether to render the border of the subs.
 CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int xsub, int ysub, const long* switchpts, bool fBody, bool fBorder)
@@ -853,13 +886,16 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 
 	// The alpha bitmap of the subtitles?
 	const byte* src = mpOverlayBuffer + 2*(mOverlayWidth * yo + xo);
+	// s points to the bitmap that is treated as the "body".
+	// When fBorder is set that is src+1, which points to the array of
+	// widened regions which contain both border and fill in one.
 	const byte* s = fBorder ? (src+1) : src;
 	// The complex "vector clip mask" I think.
 	const byte* am = pAlphaMask + spd.w * y + x;
 	// How would this differ from src?
 	unsigned long* dst = (unsigned long *)((char *)spd.bits + spd.pitch * y) + x;
 
-	// ??? What is switchpts ?
+	// Grab the first colour
 	unsigned long color = switchpts[0];
 
 	// CPUID from VDub
@@ -871,23 +907,24 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 		// Basic case of no complex clipping mask
 		if(!pAlphaMask)
 		{
-			// Again, what is switchpts?
+			// If the first colour switching coordinate is at "infinite" we're
+			// never switching and can use some simpler code.
+			// ??? Is this optimisation really worth the extra readability issues it adds?
 			if(switchpts[1] == 0xffffffff)
 			{
-				// Are we rendering the fill or a border/shadow? I think...
+				// fBody is true if we're rendering a fill or a shadow.
 				if(fBody)
 				{
 					// Run over every pixel, overlaying the subtitles with the fill colour
 					if(fSSE2)
 						for(int wt=0; wt<w; ++wt)
-							// Why s[wt*2] and not s[wt] ?
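-							// The <<6 is due to pixmix expecting the alpha parameter to be
-							// the multiplication of two 6-bit unsigned numbers but we
-							// only have one here. (No alpha mask.)
+							// s holds a single 6-bit coverage value per pixel; pixmix
+							// scales it by the colour alpha itself, so no <<6 is needed
+							// here. (No alpha mask.)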
-							pixmix_sse2(&dst[wt], color, s[wt*2]<<6);
+							pixmix_sse2(&dst[wt], color, s[wt*2]);
 					else
 						for(int wt=0; wt<w; ++wt)
-							pixmix(&dst[wt], color, s[wt*2]<<6);
+							pixmix(&dst[wt], color, s[wt*2]);
 				}
 				// Not body, ie. something else (border, shadow, I guess)
 				else
@@ -902,10 +939,10 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 							// created by CreateWidenedRegion, and thus contains
 							// both the fill and the border, so subtracting the fill
 							// from that is always safe.
-							pixmix_sse2(&dst[wt], color, (src[wt*2+1] - src[wt*2])<<6);
+							pixmix_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2]);
 					else
 						for(int wt=0; wt<w; ++wt)
-							pixmix(&dst[wt], color, (src[wt*2+1] - src[wt*2])<<6);
+							pixmix(&dst[wt], color, src[wt*2+1] - src[wt*2]);
 				}
 			}
 			// not (switchpts[1] == 0xffffffff)
@@ -923,13 +960,13 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 						// So if we have passed the switchpoint (?) switch to another colour
 						// (So switchpts stores both colours *and* coordinates?)
 						if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
-						pixmix_sse2(&dst[wt], color, s[wt*2]<<6);
+						pixmix_sse2(&dst[wt], color, s[wt*2]);
 					}
 					else
 					for(int wt=0; wt<w; ++wt)
 					{
 						if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
-						pixmix(&dst[wt], color, s[wt*2]<<6);
+						pixmix(&dst[wt], color, s[wt*2]);
 					}
 				}
 				// Not body
@@ -939,13 +976,13 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 					for(int wt=0; wt<w; ++wt)
 					{
 						if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];} 
-						pixmix_sse2(&dst[wt], color, (src[wt*2+1] - src[wt*2])<<6);
+						pixmix_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2]);
 					}
 					else
 					for(int wt=0; wt<w; ++wt)
 					{
 						if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];} 
-						pixmix(&dst[wt], color, (src[wt*2+1] - src[wt*2])<<6);
+						pixmix(&dst[wt], color, src[wt*2+1] - src[wt*2]);
 					}
 				}
 			}
@@ -957,12 +994,6 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 			{
 				if(fBody)
 				{
-					/*const byte* s = fBorder?(src+1):src;
-
-					for(int wt=0; wt<w; ++wt)
-					{
-						pixmix2(s[wt*2]);
-					}*/
 					if(fSSE2)
 						for(int wt=0; wt<w; ++wt)
 							// Both s and am contain 6-bit bitmaps of two different
@@ -970,23 +1001,19 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 							// clipping mask.
 							// Multiplying them together yields a 12-bit number.
 							// I think some imprecision is introduced here??
-							pixmix_sse2(&dst[wt], color, s[wt*2] * am[wt]);
+							pixmix2_sse2(&dst[wt], color, s[wt*2], am[wt]);
 					else
 						for(int wt=0; wt<w; ++wt)
-							pixmix(&dst[wt], color, s[wt*2] * am[wt]);
+							pixmix2(&dst[wt], color, s[wt*2], am[wt]);
 				}
 				else
 				{
-					/*for(int wt=0; wt<w; ++wt)
-					{
-						pixmix2(src[wt*2+1]-src[wt*2]);
-					}*/
 					if(fSSE2)
 						for(int wt=0; wt<w; ++wt)
-							pixmix_sse2(&dst[wt], color, (src[wt*2+1] - src[wt*2]) * am[wt]);
+							pixmix2_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]);
 					else
 						for(int wt=0; wt<w; ++wt)
-							pixmix(&dst[wt], color, (src[wt*2+1] - src[wt*2]) * am[wt]);
+							pixmix2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]);
 				}
 			}
 			else
@@ -995,18 +1022,6 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 
 				if(fBody)
 				{
-					/*const byte* s = fBorder?(src+1):src;
-					
-					for(int wt=0; wt<w; ++wt)
-					{
-						if(wt+xo >= sw[1])
-						{
-							while(wt+xo >= sw[1]) sw += 2;
-							color = sw[-2];
-						}
-
-						pixmix2(s[wt*2]);
-					}*/
 					if(fSSE2) 
 					for(int wt=0; wt<w; ++wt)
 					{
@@ -1014,7 +1029,7 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 							while(wt+xo >= sw[1])
 								sw += 2; color = sw[-2];
 						}
-						pixmix_sse2(&dst[wt], color, s[wt*2] * am[wt]);
+						pixmix2_sse2(&dst[wt], color, s[wt*2], am[wt]);
 					}
 					else
 					for(int wt=0; wt<w; ++wt)
@@ -1023,21 +1038,11 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 							while(wt+xo >= sw[1])
 								sw += 2; color = sw[-2];
 						}
-						pixmix(&dst[wt], color, s[wt*2] * am[wt]);
+						pixmix2(&dst[wt], color, s[wt*2], am[wt]);
 					}
 				}
 				else
 				{
-					/*for(int wt=0; wt<w; ++wt)
-					{
-						if(wt+xo >= sw[1])
-						{
-							while(wt+xo >= sw[1]) sw += 2;
-							color = sw[-2];
-						}
-
-						pixmix2(src[wt*2+1]-src[wt*2]);
-					}*/
 					if(fSSE2) 
 					for(int wt=0; wt<w; ++wt)
 					{
@@ -1045,7 +1050,7 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 							while(wt+xo >= sw[1])
 								sw += 2; color = sw[-2];
 						} 
-						pixmix_sse2(&dst[wt], color, (src[wt*2+1] - src[wt*2]) * am[wt]);
+						pixmix2_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]);
 					}
 					else
 					for(int wt=0; wt<w; ++wt)
@@ -1054,7 +1059,7 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x
 							while(wt+xo >= sw[1])
 								sw += 2; color = sw[-2];
 						} 
-						pixmix(&dst[wt], color, (src[wt*2+1] - src[wt*2]) * am[wt]);
+						pixmix2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]);
 					}
 				}
 			}