From 0b2511ea90cc24361ac74546c9ac1334ddadebd9 Mon Sep 17 00:00:00 2001
From: aliaspider <aliaspider@gmail.com>
Date: Thu, 29 Mar 2018 23:46:02 +0100
Subject: [PATCH 1/3] D3D: use premultiplied alpha in the GPU texture scale
 filters.

---
 GPU/Directx9/PixelShaderGeneratorDX9.cpp | 117 +++++++++++++----------
 1 file changed, 67 insertions(+), 50 deletions(-)

diff --git a/GPU/Directx9/PixelShaderGeneratorDX9.cpp b/GPU/Directx9/PixelShaderGeneratorDX9.cpp
index 3813de00072f..026b889753b8 100644
--- a/GPU/Directx9/PixelShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/PixelShaderGeneratorDX9.cpp
@@ -64,11 +64,11 @@ static const char* sampler_gaussian =
 	"    pos.x = offset.x - i;\n"
 	"    for (j = -2.0; j< 2.0 ;j++){\n"
 	"      pos.y = offset.y - j;\n"
-	"      c=tex_sample_direct(coord - pos * u_texSize.zw).rgba;\n"
+	"      c=premultiply_alpha(tex_sample_direct(coord - pos * u_texSize.zw));\n"
 	"      tempColor+=c*KERNEL(pos.x,sharpness)*KERNEL(pos.y,sharpness);\n"
 	"    }\n"
 	"  }\n"
-	"  return tempColor;\n"
+	"  return postdivide_alpha(tempColor);\n"
 	"};\n";
 static const char* sampler_cosine =
 	"#define sharpness 1.0\n"
@@ -88,21 +88,21 @@ static const char* sampler_cosine =
 	"    pos.x = offset.x - i;\n"
 	"    for (j = -2.0; j< 2.0 ;j++){\n"
 	"      pos.y = offset.y - j;\n"
-	"      c=tex_sample_direct(coord - pos * u_texSize.zw).rgba;\n"
+	"      c=premultiply_alpha(tex_sample_direct(coord - pos * u_texSize.zw));\n"
 	"      tempColor+=c*KERNEL(pos.x,sharpness)*KERNEL(pos.y,sharpness);\n"
 	"    }\n"
 	"  }\n"
-	"  return tempColor.rgba;\n"
+	"  return postdivide_alpha(tempColor);\n"
 	"};\n";
 static const char* sampler_xbrz =
-	"float c_df(float4 c1, float4 c2) {\n"
-	"	float4 df = abs(c1 - c2);\n"
-	"	return df.r + df.g + df.b + df.a;\n"
+	"float c_df(float3 c1, float3 c2) {\n"
+	"	float3 df = abs(c1 - c2);\n"
+	"	return df.r + df.g + df.b;\n"
 	"}\n"
 	"\n"
 	"static const float coef = 2.0;\n"
 	"\n"
-	"static const float4  rgbw          = float4(14.352, 28.176, 5.472, 15.0);\n"
+	"static const float3  rgbw          = float3(14.352, 28.176, 5.472);\n"
 	"static const float4  eq_threshold  = float4(15.0, 15.0, 15.0, 15.0);\n"
 	"\n"
 	"static const float4 delta   = float4(1.0/4., 1.0/4., 1.0/4., 1.0/4.);\n"
@@ -154,6 +154,7 @@ static const char* sampler_xbrz =
 	"{\n"
 	"	float2 tc = coord * u_texSize.xy;\n"
 	"	float2 fp = frac(tc);\n"
+	"	tc = floor(tc) + 0.5;\n"
 	"\n"
 	"	float4 xyp_01_02_03 = u_texSize.zzzw * (tc.xxxy + float4(-1.,  0., 1., -2.));\n"
 	"	float4 xyp_06_07_08 = u_texSize.zzzw * (tc.xxxy + float4(-1.,  0., 1., -1.));\n"
@@ -178,39 +179,39 @@ static const char* sampler_xbrz =
 	"	float4 fx_u; // inequations of straight lines.\n"
 	"\n"
 	"\n"
-	"	float4 A1 = tex_sample_direct(xyp_01_02_03.xw);\n"
-	"	float4 B1 = tex_sample_direct(xyp_01_02_03.yw);\n"
-	"	float4 C1 = tex_sample_direct(xyp_01_02_03.zw);\n"
-	"	float4 A  = tex_sample_direct(xyp_06_07_08.xw);\n"
-	"	float4 B  = tex_sample_direct(xyp_06_07_08.yw);\n"
-	"	float4 C  = tex_sample_direct(xyp_06_07_08.zw);\n"
-	"	float4 D  = tex_sample_direct(xyp_11_12_13.xw);\n"
-	"	float4 E  = tex_sample_direct(xyp_11_12_13.yw);\n"
-	"	float4 F  = tex_sample_direct(xyp_11_12_13.zw);\n"
-	"	float4 G  = tex_sample_direct(xyp_16_17_18.xw);\n"
-	"	float4 H  = tex_sample_direct(xyp_16_17_18.yw);\n"
-	"	float4 I  = tex_sample_direct(xyp_16_17_18.zw);\n"
-	"	float4 G5 = tex_sample_direct(xyp_21_22_23.xw);\n"
-	"	float4 H5 = tex_sample_direct(xyp_21_22_23.yw);\n"
-	"	float4 I5 = tex_sample_direct(xyp_21_22_23.zw);\n"
-	"	float4 A0 = tex_sample_direct(xyp_05_10_15.xy);\n"
-	"	float4 D0 = tex_sample_direct(xyp_05_10_15.xz);\n"
-	"	float4 G0 = tex_sample_direct(xyp_05_10_15.xw);\n"
-	"	float4 C4 = tex_sample_direct(xyp_09_14_09.xy);\n"
-	"	float4 F4 = tex_sample_direct(xyp_09_14_09.xz);\n"
-	"	float4 I4 = tex_sample_direct(xyp_09_14_09.xw);\n"
-	"\n"
-	"	float4 b  = float4(dot(B ,rgbw), dot(D ,rgbw), dot(H ,rgbw), dot(F ,rgbw));\n"
-	"	float4 c  = float4(dot(C ,rgbw), dot(A ,rgbw), dot(G ,rgbw), dot(I ,rgbw));\n"
+	"	float4 A1 = premultiply_alpha(tex_sample_direct(xyp_01_02_03.xw));\n"
+	"	float4 B1 = premultiply_alpha(tex_sample_direct(xyp_01_02_03.yw));\n"
+	"	float4 C1 = premultiply_alpha(tex_sample_direct(xyp_01_02_03.zw));\n"
+	"	float4 A  = premultiply_alpha(tex_sample_direct(xyp_06_07_08.xw));\n"
+	"	float4 B  = premultiply_alpha(tex_sample_direct(xyp_06_07_08.yw));\n"
+	"	float4 C  = premultiply_alpha(tex_sample_direct(xyp_06_07_08.zw));\n"
+	"	float4 D  = premultiply_alpha(tex_sample_direct(xyp_11_12_13.xw));\n"
+	"	float4 E  = premultiply_alpha(tex_sample_direct(xyp_11_12_13.yw));\n"
+	"	float4 F  = premultiply_alpha(tex_sample_direct(xyp_11_12_13.zw));\n"
+	"	float4 G  = premultiply_alpha(tex_sample_direct(xyp_16_17_18.xw));\n"
+	"	float4 H  = premultiply_alpha(tex_sample_direct(xyp_16_17_18.yw));\n"
+	"	float4 I  = premultiply_alpha(tex_sample_direct(xyp_16_17_18.zw));\n"
+	"	float4 G5 = premultiply_alpha(tex_sample_direct(xyp_21_22_23.xw));\n"
+	"	float4 H5 = premultiply_alpha(tex_sample_direct(xyp_21_22_23.yw));\n"
+	"	float4 I5 = premultiply_alpha(tex_sample_direct(xyp_21_22_23.zw));\n"
+	"	float4 A0 = premultiply_alpha(tex_sample_direct(xyp_05_10_15.xy));\n"
+	"	float4 D0 = premultiply_alpha(tex_sample_direct(xyp_05_10_15.xz));\n"
+	"	float4 G0 = premultiply_alpha(tex_sample_direct(xyp_05_10_15.xw));\n"
+	"	float4 C4 = premultiply_alpha(tex_sample_direct(xyp_09_14_09.xy));\n"
+	"	float4 F4 = premultiply_alpha(tex_sample_direct(xyp_09_14_09.xz));\n"
+	"	float4 I4 = premultiply_alpha(tex_sample_direct(xyp_09_14_09.xw));\n"
+	"\n"
+	"	float4 b  = float4(dot(B.rgb ,rgbw), dot(D.rgb ,rgbw), dot(H.rgb ,rgbw), dot(F.rgb ,rgbw));\n"
+	"	float4 c  = float4(dot(C.rgb ,rgbw), dot(A.rgb ,rgbw), dot(G.rgb ,rgbw), dot(I.rgb ,rgbw));\n"
 	"	float4 d  = b.yzwx;\n"
-	"	float4 e  = dot(E,rgbw).xxxx;\n"
+	"	float4 e  = dot(E.rgb,rgbw).xxxx;\n"
 	"	float4 f  = b.wxyz;\n"
 	"	float4 g  = c.zwxy;\n"
 	"	float4 h  = b.zwxy;\n"
 	"	float4 i  = c.wxyz;\n"
-	"	float4 i4 = float4(dot(I4,rgbw), dot(C1,rgbw), dot(A0,rgbw), dot(G5,rgbw));\n"
-	"	float4 i5 = float4(dot(I5,rgbw), dot(C4,rgbw), dot(A1,rgbw), dot(G0,rgbw));\n"
-	"	float4 h5 = float4(dot(H5,rgbw), dot(F4,rgbw), dot(B1,rgbw), dot(D0,rgbw));\n"
+	"	float4 i4 = float4(dot(I4.rgb,rgbw), dot(C1.rgb,rgbw), dot(A0.rgb,rgbw), dot(G5.rgb,rgbw));\n"
+	"	float4 i5 = float4(dot(I5.rgb,rgbw), dot(C4.rgb,rgbw), dot(A1.rgb,rgbw), dot(G0.rgb,rgbw));\n"
+	"	float4 h5 = float4(dot(H5.rgb,rgbw), dot(F4.rgb,rgbw), dot(B1.rgb,rgbw), dot(D0.rgb,rgbw));\n"
 	"	float4 f4 = h5.yzwx;\n"
 	"\n"
 	"	// These inequations define the line below which interpolation occurs.\n"
@@ -218,9 +219,14 @@ static const char* sampler_xbrz =
 	"	fx_l = (Ax*fp.y+Bx*fp.x);\n"
 	"	fx_u = (Ay*fp.y+By*fp.x);\n"
 	"\n"
-	"	irlv1 = irlv0 = diff(e,f) * diff(e,h);\n"
+	"	irlv0 = diff(e,f) * diff(e,h);\n"
 	"\n"
-	"	irlv1     = (irlv0  * ( neq(f,b) * neq(f,c) + neq(h,d) * neq(h,g) + eq(e,i) * (neq(f,f4) * neq(f,i4) + neq(h,h5) * neq(h,i5)) + eq(e,g) + eq(e,c)) );\n"
+//	"	irlv1 = irlv0;\n"
+//	"  irlv1 = (irlv0  * ( neq(f,b) * neq(h,d) + eq(e,i) * neq(f,i4) * neq(h,i5) + eq(e,g) + eq(e,c) ) );\n"
+//	"  float4 c1 = i4.yzwx;\n"
+//	"  float4 g0 = i5.wxyz;\n"
+//	"  irlv1 = (irlv0  * ( neq(f,b) * neq(h,d) + eq(e,i) * neq(f,i4) * neq(h,i5) + eq(e,g) + eq(e,c) ) * (diff(f,f4) * diff(f,i) + diff(h,h5) * diff(h,i) + diff(h,g) + diff(f,c) + eq(b,c1) * eq(d,g0)));\n"
+	"	irlv1 = (irlv0  * ( neq(f,b) * neq(f,c) + neq(h,d) * neq(h,g) + eq(e,i) * (neq(f,f4) * neq(f,i4) + neq(h,h5) * neq(h,i5)) + eq(e,g) + eq(e,c)) );\n"
 	"\n"
 	"	irlv2l = diff(e,g) * diff(d,g);\n"
 	"	irlv2u = diff(e,c) * diff(b,c);\n"
@@ -246,6 +252,7 @@ static const char* sampler_xbrz =
 	"	px = step(df(e,f), df(e,h));\n"
 	"\n"
 	"	float4 maximos = max(max(fx30, fx60), max(fx45, fx45i));\n"
+//	"  float4 maximos = max(max(fx30, fx60), fx45);"
 	"\n"
 	"	float4 res1 = E;\n"
 	"	res1 = lerp(res1, lerp(H, F, px.x), maximos.x);\n"
@@ -255,12 +262,13 @@ static const char* sampler_xbrz =
 	"	res2 = lerp(res2, lerp(F, B, px.y), maximos.y);\n"
 	"	res2 = lerp(res2, lerp(D, H, px.w), maximos.w);\n"
 	"\n"
-	"	return lerp(res1, res2, step(c_df(E, res1), c_df(E, res2)));\n"
+	"	float4 res = lerp(res1, res2, step(c_df(E.rgb, res1.rgb), c_df(E.rgb, res2.rgb)));\n"
+	"  return postdivide_alpha(res);\n"
 	"}\n";
 static const char* sampler_sabr =
 	"float c_df(float4 c1, float4 c2) {\n"
-	"	float4 df = abs(c1 - c2);\n"
-	"	return df.r + df.g + df.b + df.a;\n"
+	"	float3 df = abs(c1.rgb - c2.rgb);\n"
+	"	return df.r + df.g + df.b;\n"
 	"}\n"
 	"static const  float4 Ai  = float4( 1.0, -1.0, -1.0,  1.0);\n"
 	"static const  float4 B45 = float4( 1.0,  1.0, -1.0, -1.0);\n"
@@ -270,7 +278,7 @@ static const char* sampler_sabr =
 	"static const  float3 lum = float3(0.21, 0.72, 0.07);\n"
 	"\n"
 	"float lum_to(float4 v) {\n"
-	"  return dot(lum, v.rgb) * v.a;\n"
+	"  return dot(lum, v.rgb);\n"
 	"}\n"
 	"float4 lum_to(float4 v0, float4 v1, float4 v2, float4 v3) {\n"
 	"  return float4(lum_to(v0), lum_to(v1), lum_to(v2), lum_to(v3));\n"
@@ -290,11 +298,11 @@ static const char* sampler_sabr =
 	"	+-----+-----+-----+\n"
 	"*/\n"
 	"// Store mask values\n"
-	"  float4 P07 = tex_sample_direct(coord + u_texSize.zw * float2( 0.0, -1.0));\n"
-	"  float4 P11 = tex_sample_direct(coord + u_texSize.zw * float2(-1.0,  0.0));\n"
-	"  float4 P12 = tex_sample_direct(coord + u_texSize.zw * float2( 0.0,  0.0));\n"
-	"  float4 P13 = tex_sample_direct(coord + u_texSize.zw * float2( 1.0,  0.0));\n"
-	"  float4 P17 = tex_sample_direct(coord + u_texSize.zw * float2( 0.0,  1.0));\n"
+	"  float4 P07 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0, -1.0)));\n"
+	"  float4 P11 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2(-1.0,  0.0)));\n"
+	"  float4 P12 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0,  0.0)));\n"
+	"  float4 P13 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 1.0,  0.0)));\n"
+	"  float4 P17 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0,  1.0)));\n"
 	"\n"
 	"  // Store luminance values of each point\n"
 	"  float4 p7  = lum_to(P07, P11, P17, P13);\n"
@@ -319,7 +327,7 @@ static const char* sampler_sabr =
 	"  res2 = lerp(res2, lerp(P13, P17, px.x), ma45.x);\n"
 	"\n"
 	"  float4 res = lerp(res1, res2, step(c_df(P12, res1), c_df(P12, res2)));\n"
-	"  return res;\n"
+	"  return postdivide_alpha(res);\n"
 	"}\n";
 
 static const char* sampler_hybrid = sampler_xbrz;
@@ -357,11 +365,11 @@ static const char* sampler_bicubic =
 	"    pos.x = offset.x - i;\n"
 	"    for (j = -2.0; j < 3.0 ;j++){\n"
 	"      pos.y = offset.y - j;\n"
-	"      c=tex_sample_direct(coord - pos * u_texSize.zw).rgba;\n"
+	"      c=premultiply_alpha(tex_sample_direct(coord - pos * u_texSize.zw));\n"
 	"      tempColor+=c*mitchell(pos);\n"
 	"    }\n"
 	"  }\n"
-	"  return tempColor;\n"
+	"  return postdivide_alpha(tempColor);\n"
 	"};\n";
 
 // Missing: Z depth range
@@ -484,6 +492,15 @@ bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, ShaderLanguag
 	WRITE(p, "};\n");
 
 	if (!isModeClear && doTexture) {
+		if (doTextureAlpha) {
+			// TODO: check why the [0.0,1.0] clamp is necessary here
+			WRITE(p, "float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); }");
+			WRITE(p, "float4 postdivide_alpha(float4 c) { return c.a < 0.001f? 0.0f : float4(c.rgb / c.a, c.a); }\n");
+		} else {
+			WRITE(p, "#define premultiply_alpha(c) (c)\n");
+			WRITE(p, "#define postdivide_alpha(c) (c)\n");
+		}
+
 		WRITE(p, "float4 tex_sample_direct(float2 coord) {\n");
 		if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) {
 				WRITE(p, "  return tex.Sample(samp, coord)%s;\n", bgraTexture ? ".bgra" : "");

From 45a08097688ffd94eba2f0aff8922c2347b14ef4 Mon Sep 17 00:00:00 2001
From: aliaspider <aliaspider@gmail.com>
Date: Fri, 30 Mar 2018 19:21:57 +0100
Subject: [PATCH 2/3] D3D: load GPU scaler code from the assets folder at
 runtime. Also add xbrz code in addition to xbr.

---
 CMakeLists.txt                             |   3 +-
 GPU/Common/TextureScalerCommon.cpp         |   5 +-
 GPU/Common/TextureScalerCommon.h           |   2 +-
 GPU/D3D11/FragmentShaderGeneratorD3D11.cpp |   4 +-
 GPU/D3D11/FragmentShaderGeneratorD3D11.h   |   2 +-
 GPU/D3D11/ShaderManagerD3D11.cpp           |  32 +-
 GPU/D3D11/ShaderManagerD3D11.h             |   2 +
 GPU/Directx9/PixelShaderGeneratorDX9.cpp   | 376 +--------------------
 GPU/Directx9/PixelShaderGeneratorDX9.h     |   2 +-
 GPU/Directx9/ShaderManagerDX9.cpp          |   2 +-
 UI/GameSettingsScreen.cpp                  |   2 +-
 Windows/MainWindowMenu.cpp                 |   2 +
 Windows/ppsspp.rc                          |   1 +
 Windows/resource.h                         |   7 +-
 assets/scalers/bicubic.hlsl                |  49 +++
 assets/scalers/cosine.hlsl                 |  24 ++
 assets/scalers/gaussian.hlsl               |  33 ++
 assets/scalers/hybrid.hlsl                 |   4 +
 assets/scalers/hybrid_bicubic.hlsl         |   4 +
 assets/scalers/sabr.hlsl                   |  64 ++++
 assets/scalers/xbr.hlsl                    | 210 ++++++++++++
 assets/scalers/xbrz.hlsl                   | 335 ++++++++++++++++++
 libretro/libretro.cpp                      |   2 +-
 23 files changed, 779 insertions(+), 388 deletions(-)
 create mode 100644 assets/scalers/bicubic.hlsl
 create mode 100644 assets/scalers/cosine.hlsl
 create mode 100644 assets/scalers/gaussian.hlsl
 create mode 100644 assets/scalers/hybrid.hlsl
 create mode 100644 assets/scalers/hybrid_bicubic.hlsl
 create mode 100644 assets/scalers/sabr.hlsl
 create mode 100644 assets/scalers/xbr.hlsl
 create mode 100644 assets/scalers/xbrz.hlsl

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff1f78a1d287..39b40360eb55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1851,7 +1851,8 @@ endif()
 set(NativeAssets
 	android/assets/ui_atlas.zim
 	assets/lang
-	assets/shaders
+   assets/shaders
+   assets/scalers
 	assets/Roboto-Condensed.ttf
 	assets/7z.png
 	assets/compat.ini
diff --git a/GPU/Common/TextureScalerCommon.cpp b/GPU/Common/TextureScalerCommon.cpp
index bf4d9f14e82f..d01dd3e605a2 100644
--- a/GPU/Common/TextureScalerCommon.cpp
+++ b/GPU/Common/TextureScalerCommon.cpp
@@ -555,8 +555,9 @@ bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, u32 &dstFmt, int &
 
 	// scale 
 	switch (g_Config.iTexScalingType) {
+	case XBR:
 	case SABR:
-		// no cpu implementation for sabr, fall back to xbrz
+		// no cpu implementation, fall back to xbrz
 	case XBRZ:
 		ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
 		break;
@@ -565,7 +566,7 @@ bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, u32 &dstFmt, int &
 		break;
 	case GAUSSIAN:
 	case COSINE:
-		// no cpu implementation for those, fall back to bicubic
+		// no cpu implementation, fall back to bicubic
 	case BICUBIC:
 		ScaleBicubicMitchell(factor, inputBuf, outputBuf, width, height);
 		break;
diff --git a/GPU/Common/TextureScalerCommon.h b/GPU/Common/TextureScalerCommon.h
index dced8353d03d..8711f77b7cba 100644
--- a/GPU/Common/TextureScalerCommon.h
+++ b/GPU/Common/TextureScalerCommon.h
@@ -31,7 +31,7 @@ class TextureScalerCommon {
 	bool Scale(u32 *&data, u32 &dstfmt, int &width, int &height, int factor);
 	bool ScaleInto(u32 *out, u32 *src, u32 &dstfmt, int &width, int &height, int factor);
 
-	enum { XBRZ = 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3, SABR = 4, GAUSSIAN = 5, COSINE = 6 };
+	enum { XBRZ = 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3, XBR = 4, SABR = 5, GAUSSIAN = 6, COSINE = 7 };
 
 protected:
 	virtual void ConvertTo8888(u32 format, u32 *source, u32 *&dest, int width, int height) = 0;
diff --git a/GPU/D3D11/FragmentShaderGeneratorD3D11.cpp b/GPU/D3D11/FragmentShaderGeneratorD3D11.cpp
index 07d37c8d5ce1..61a23e26df8f 100644
--- a/GPU/D3D11/FragmentShaderGeneratorD3D11.cpp
+++ b/GPU/D3D11/FragmentShaderGeneratorD3D11.cpp
@@ -19,6 +19,6 @@
 #include "GPU/D3D11/FragmentShaderGeneratorD3D11.h"
 #include "GPU/Directx9/PixelShaderGeneratorDX9.h"
 
-void GenerateFragmentShaderD3D11(const FShaderID &id, char *buffer, ShaderLanguage lang) {
-	DX9::GenerateFragmentShaderHLSL(id, buffer, lang);
+void GenerateFragmentShaderD3D11(const FShaderID &id, char *buffer, char* scalerCode, ShaderLanguage lang) {
+	DX9::GenerateFragmentShaderHLSL(id, buffer, scalerCode, lang);
 }
diff --git a/GPU/D3D11/FragmentShaderGeneratorD3D11.h b/GPU/D3D11/FragmentShaderGeneratorD3D11.h
index cf94a63200bc..927682e799ca 100644
--- a/GPU/D3D11/FragmentShaderGeneratorD3D11.h
+++ b/GPU/D3D11/FragmentShaderGeneratorD3D11.h
@@ -19,4 +19,4 @@
 
 #include "GPU/Common/ShaderId.h"
 
-void GenerateFragmentShaderD3D11(const FShaderID &id, char *buffer, ShaderLanguage lang);
+void GenerateFragmentShaderD3D11(const FShaderID &id, char *buffer, char *scalerCode, ShaderLanguage lang);
diff --git a/GPU/D3D11/ShaderManagerD3D11.cpp b/GPU/D3D11/ShaderManagerD3D11.cpp
index 7e79d5722450..2ce967bf2065 100644
--- a/GPU/D3D11/ShaderManagerD3D11.cpp
+++ b/GPU/D3D11/ShaderManagerD3D11.cpp
@@ -26,6 +26,7 @@
 #include "math/lin/matrix4x4.h"
 #include "math/math_util.h"
 #include "math/dataconv.h"
+#include "file/vfs.h"
 #include "util/text/utf8.h"
 #include "Common/Common.h"
 #include "Core/Config.h"
@@ -89,8 +90,8 @@ std::string D3D11VertexShader::GetShaderString(DebugShaderStringType type) const
 }
 
 ShaderManagerD3D11::ShaderManagerD3D11(ID3D11Device *device, ID3D11DeviceContext *context, D3D_FEATURE_LEVEL featureLevel)
-	: device_(device), context_(context), featureLevel_(featureLevel), lastVShader_(nullptr), lastFShader_(nullptr) {
-	codeBuffer_ = new char[16384];
+	: device_(device), context_(context), featureLevel_(featureLevel), lastVShader_(nullptr), lastFShader_(nullptr), scalerCode_(nullptr) {
+	codeBuffer_ = new char[16384 * 2];
 	memset(&ub_base, 0, sizeof(ub_base));
 	memset(&ub_lights, 0, sizeof(ub_lights));
 
@@ -101,6 +102,7 @@ ShaderManagerD3D11::ShaderManagerD3D11(ID3D11Device *device, ID3D11DeviceContext
 	ASSERT_SUCCESS(device_->CreateBuffer(&desc, nullptr, &push_base));
 	desc.ByteWidth = sizeof(ub_lights);
 	ASSERT_SUCCESS(device_->CreateBuffer(&desc, nullptr, &push_lights));
+	needScalerCode = g_Config.bRealtimeTexScaling && g_Config.iTexScalingLevel != 1;
 }
 
 ShaderManagerD3D11::~ShaderManagerD3D11() {
@@ -128,6 +130,9 @@ void ShaderManagerD3D11::ClearShaders() {
 	Clear();
 	DirtyLastShader();
 	gstate_c.Dirty(DIRTY_ALL_UNIFORMS);
+	delete[] scalerCode_;  // buffer comes from VFSReadFile(); NOTE(review): presumably new[]-allocated, so free() would mismatch - confirm
+	scalerCode_ = nullptr;
+	needScalerCode = g_Config.bRealtimeTexScaling && g_Config.iTexScalingLevel != 1;
 }
 
 void ShaderManagerD3D11::DirtyLastShader() {
@@ -208,7 +213,28 @@ void ShaderManagerD3D11::GetShaders(int prim, u32 vertType, D3D11VertexShader **
 	D3D11FragmentShader *fs;
 	if (fsIter == fsCache_.end()) {
 		// Fragment shader not in cache. Let's compile it.
-		GenerateFragmentShaderD3D11(FSID, codeBuffer_, featureLevel_ <= D3D_FEATURE_LEVEL_9_3 ? HLSL_D3D11_LEVEL9 : HLSL_D3D11);
+		if(needScalerCode && !scalerCode_) {
+			static const char* filenames[] = {
+				"scalers/xbrz.hlsl",
+				"scalers/hybrid.hlsl",
+				"scalers/bicubic.hlsl",
+				"scalers/hybrid_bicubic.hlsl",
+				"scalers/xbr.hlsl",
+				"scalers/sabr.hlsl",
+				"scalers/gaussian.hlsl",
+				"scalers/cosine.hlsl",
+			};
+			if ((g_Config.iTexScalingType < 0) || (g_Config.iTexScalingType >= sizeof(filenames) / sizeof(*filenames))) {
+				ERROR_LOG(G3D, "Unknown scaling type: %i", g_Config.iTexScalingType);
+			} else {
+				size_t sz;
+				scalerCode_ = (char *)VFSReadFile(filenames[g_Config.iTexScalingType], &sz);
+				if(!scalerCode_)
+					ERROR_LOG(G3D, "Scaler not found: %s", filenames[g_Config.iTexScalingType]);
+			}
+			needScalerCode = false;
+		}
+		GenerateFragmentShaderD3D11(FSID, codeBuffer_, scalerCode_, featureLevel_ <= D3D_FEATURE_LEVEL_9_3 ? HLSL_D3D11_LEVEL9 : HLSL_D3D11);
 		fs = new D3D11FragmentShader(device_, featureLevel_, FSID, codeBuffer_, useHWTransform);
 		fsCache_[FSID] = fs;
 	} else {
diff --git a/GPU/D3D11/ShaderManagerD3D11.h b/GPU/D3D11/ShaderManagerD3D11.h
index cc0da972bd39..656b19900e91 100644
--- a/GPU/D3D11/ShaderManagerD3D11.h
+++ b/GPU/D3D11/ShaderManagerD3D11.h
@@ -116,6 +116,8 @@ class ShaderManagerD3D11 : public ShaderManagerCommon {
 	VSCache vsCache_;
 
 	char *codeBuffer_;
+	char *scalerCode_;
+	bool needScalerCode;
 
 	// Uniform block scratchpad. These (the relevant ones) are copied to the current pushbuffer at draw time.
 	UB_VS_FS_Base ub_base;
diff --git a/GPU/Directx9/PixelShaderGeneratorDX9.cpp b/GPU/Directx9/PixelShaderGeneratorDX9.cpp
index 026b889753b8..6207ab22d850 100644
--- a/GPU/Directx9/PixelShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/PixelShaderGeneratorDX9.cpp
@@ -32,349 +32,9 @@
 
 namespace DX9 {
 
-static const char* sampler_default =
-	"float4 tex_sample(float2 coord) {\n"
-	"  return tex_sample_direct(coord);\n"
-	"};\n";
-
-static const char* sampler_gaussian =
-	"#define sharpness 1.0\n"
-	"#define pi 3.14159265358\n"
-	"#define normalGauss(x) ((exp(-(x)*(x)*0.5))/sqrt(2.0*pi))\n"
-	"#define normalGauss2(x) (normalGauss(x - 0.5) - 0.5)\n"
-	"float normalGaussIntegral(float x)\n"
-	"{\n"
-	"	 float a1 = 0.4361836;\n"
-	"	 float a2 = -0.1201676;\n"
-	"	 float a3 = 0.9372980;\n"
-	"	 float p = 0.3326700;\n"
-	"	 float t = 1.0 / (1.0 + p*abs(x));\n"
-	"\n"
-	"	 return (0.5-normalGauss(x) * (t*(a1 + t*(a2 + a3*t))))*sign(x);\n"
-	"}\n"
-	"#define KERNEL(x,b) (normalGaussIntegral(sqrt(2*pi)*b*(x - 0.5)) - normalGaussIntegral(sqrt(2*pi)*b*(x + 0.5)))\n"
-	"\n"
-	"float4 tex_sample(float2 coord) {\n"
-	"  float2 offset = frac(coord * u_texSize.xy) - 0.5;\n"
-	"  float4 tempColor = 0.0;\n"
-	"  float4 c;\n"
-	"  float i,j;\n"
-	"  float2 pos;\n"
-	"  for (i = -2.0; i < 2.0; i++){\n"
-	"    pos.x = offset.x - i;\n"
-	"    for (j = -2.0; j< 2.0 ;j++){\n"
-	"      pos.y = offset.y - j;\n"
-	"      c=premultiply_alpha(tex_sample_direct(coord - pos * u_texSize.zw));\n"
-	"      tempColor+=c*KERNEL(pos.x,sharpness)*KERNEL(pos.y,sharpness);\n"
-	"    }\n"
-	"  }\n"
-	"  return postdivide_alpha(tempColor);\n"
-	"};\n";
-static const char* sampler_cosine =
-	"#define sharpness 1.0\n"
-	"#define pi 3.14159265358\n"
-	"#define a(x) abs(x)\n"
-	"#define d(x,b) (pi*b*min(a(x)+0.5,1.0/b))\n"
-	"#define e(x,b) (pi*b*min(max(a(x)-0.5,-1.0/b),1.0/b))\n"
-	"#define KERNEL(x,b) ((d(x,b)+sin(d(x,b))-e(x,b)-sin(e(x,b)))/(2.0*pi))\n"
-	"\n"
-	"float4 tex_sample(float2 coord) {\n"
-	"  float2 offset = frac(coord * u_texSize.xy) - 0.5;\n"
-	"  float4 tempColor = 0.0;\n"
-	"  float4 c;\n"
-	"  float i,j;\n"
-	"  float2 pos;\n"
-	"  for (i = -2.0; i < 2.0; i++){\n"
-	"    pos.x = offset.x - i;\n"
-	"    for (j = -2.0; j< 2.0 ;j++){\n"
-	"      pos.y = offset.y - j;\n"
-	"      c=premultiply_alpha(tex_sample_direct(coord - pos * u_texSize.zw));\n"
-	"      tempColor+=c*KERNEL(pos.x,sharpness)*KERNEL(pos.y,sharpness);\n"
-	"    }\n"
-	"  }\n"
-	"  return postdivide_alpha(tempColor);\n"
-	"};\n";
-static const char* sampler_xbrz =
-	"float c_df(float3 c1, float3 c2) {\n"
-	"	float3 df = abs(c1 - c2);\n"
-	"	return df.r + df.g + df.b;\n"
-	"}\n"
-	"\n"
-	"static const float coef = 2.0;\n"
-	"\n"
-	"static const float3  rgbw          = float3(14.352, 28.176, 5.472);\n"
-	"static const float4  eq_threshold  = float4(15.0, 15.0, 15.0, 15.0);\n"
-	"\n"
-	"static const float4 delta   = float4(1.0/4., 1.0/4., 1.0/4., 1.0/4.);\n"
-	"static const float4 delta_l = float4(0.5/4., 1.0/4., 0.5/4., 1.0/4.);\n"
-	"static const float4 delta_u = delta_l.yxwz;\n"
-	"\n"
-	"static const float4 Ao = float4( 1.0, -1.0, -1.0, 1.0 );\n"
-	"static const float4 Bo = float4( 1.0,  1.0, -1.0,-1.0 );\n"
-	"static const float4 Co = float4( 1.5,  0.5, -0.5, 0.5 );\n"
-	"static const float4 Ax = float4( 1.0, -1.0, -1.0, 1.0 );\n"
-	"static const float4 Bx = float4( 0.5,  2.0, -0.5,-2.0 );\n"
-	"static const float4 Cx = float4( 1.0,  1.0, -0.5, 0.0 );\n"
-	"static const float4 Ay = float4( 1.0, -1.0, -1.0, 1.0 );\n"
-	"static const float4 By = float4( 2.0,  0.5, -2.0,-0.5 );\n"
-	"static const float4 Cy = float4( 2.0,  0.0, -1.0, 0.5 );\n"
-	"static const float4 Ci = float4(0.25, 0.25, 0.25, 0.25);\n"
-	"\n"
-	"// Difference between vector components.\n"
-	"float4 df(float4 A, float4 B)\n"
-	"{\n"
-	"	 return float4(abs(A-B));\n"
-	"}\n"
-	"\n"
-	"// Compare two vectors and return their components are different.\n"
-	"float4 diff(float4 A, float4 B)\n"
-	"{\n"
-	"	 return step(0.001, df(A, B));\n"
-	"}\n"
-	"\n"
-	"// Determine if two vector components are equal based on a threshold.\n"
-	"float4 eq(float4 A, float4 B)\n"
-	"{\n"
-	"	 return step(df(A, B), 15.);\n"
-	"}\n"
-	"\n"
-	"// Determine if two vector components are NOT equal based on a threshold.\n"
-	"float4 neq(float4 A, float4 B)\n"
-	"{\n"
-	"	 return step(15., df(A, B));\n"
-	"}\n"
-	"\n"
-	"// Weighted distance.\n"
-	"float4 wd(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h)\n"
-	"{\n"
-	"	 return (df(a,b) + df(a,c) + df(d,e) + df(d,f) + 4.0*df(g,h));\n"
-	"}\n"
-	"\n"
-	"float4 tex_sample(float2 coord)\n"
-	"{\n"
-	"	float2 tc = coord * u_texSize.xy;\n"
-	"	float2 fp = frac(tc);\n"
-	"	tc = floor(tc) + 0.5;\n"
-	"\n"
-	"	float4 xyp_01_02_03 = u_texSize.zzzw * (tc.xxxy + float4(-1.,  0., 1., -2.));\n"
-	"	float4 xyp_06_07_08 = u_texSize.zzzw * (tc.xxxy + float4(-1.,  0., 1., -1.));\n"
-	"	float4 xyp_11_12_13 = u_texSize.zzzw * (tc.xxxy + float4(-1.,  0., 1.,  0.));\n"
-	"	float4 xyp_16_17_18 = u_texSize.zzzw * (tc.xxxy + float4(-1.,  0., 1.,  1.));\n"
-	"	float4 xyp_21_22_23 = u_texSize.zzzw * (tc.xxxy + float4(-1.,  0., 1.,  2.));\n"
-	"	float4 xyp_05_10_15 = u_texSize.zwww * (tc.xyyy + float4(-2., -1., 0.,  1.));\n"
-	"	float4 xyp_09_14_09 = u_texSize.zwww * (tc.xyyy + float4( 2., -1., 0.,  1.));\n"
-	"\n"
-	"	float4 edri;\n"
-	"	float4 edr;\n"
-	"	float4 edr_l;\n"
-	"	float4 edr_u;\n"
-	"	float4 px; // px = pixel, edr = edge detection rule\n"
-	"	float4 irlv0;\n"
-	"	float4 irlv1;\n"
-	"	float4 irlv2l;\n"
-	"	float4 irlv2u;\n"
-	"	float4 block_3d;\n"
-	"	float4 fx;\n"
-	"	float4 fx_l;\n"
-	"	float4 fx_u; // inequations of straight lines.\n"
-	"\n"
-	"\n"
-	"	float4 A1 = premultiply_alpha(tex_sample_direct(xyp_01_02_03.xw));\n"
-	"	float4 B1 = premultiply_alpha(tex_sample_direct(xyp_01_02_03.yw));\n"
-	"	float4 C1 = premultiply_alpha(tex_sample_direct(xyp_01_02_03.zw));\n"
-	"	float4 A  = premultiply_alpha(tex_sample_direct(xyp_06_07_08.xw));\n"
-	"	float4 B  = premultiply_alpha(tex_sample_direct(xyp_06_07_08.yw));\n"
-	"	float4 C  = premultiply_alpha(tex_sample_direct(xyp_06_07_08.zw));\n"
-	"	float4 D  = premultiply_alpha(tex_sample_direct(xyp_11_12_13.xw));\n"
-	"	float4 E  = premultiply_alpha(tex_sample_direct(xyp_11_12_13.yw));\n"
-	"	float4 F  = premultiply_alpha(tex_sample_direct(xyp_11_12_13.zw));\n"
-	"	float4 G  = premultiply_alpha(tex_sample_direct(xyp_16_17_18.xw));\n"
-	"	float4 H  = premultiply_alpha(tex_sample_direct(xyp_16_17_18.yw));\n"
-	"	float4 I  = premultiply_alpha(tex_sample_direct(xyp_16_17_18.zw));\n"
-	"	float4 G5 = premultiply_alpha(tex_sample_direct(xyp_21_22_23.xw));\n"
-	"	float4 H5 = premultiply_alpha(tex_sample_direct(xyp_21_22_23.yw));\n"
-	"	float4 I5 = premultiply_alpha(tex_sample_direct(xyp_21_22_23.zw));\n"
-	"	float4 A0 = premultiply_alpha(tex_sample_direct(xyp_05_10_15.xy));\n"
-	"	float4 D0 = premultiply_alpha(tex_sample_direct(xyp_05_10_15.xz));\n"
-	"	float4 G0 = premultiply_alpha(tex_sample_direct(xyp_05_10_15.xw));\n"
-	"	float4 C4 = premultiply_alpha(tex_sample_direct(xyp_09_14_09.xy));\n"
-	"	float4 F4 = premultiply_alpha(tex_sample_direct(xyp_09_14_09.xz));\n"
-	"	float4 I4 = premultiply_alpha(tex_sample_direct(xyp_09_14_09.xw));\n"
-	"\n"
-	"	float4 b  = float4(dot(B.rgb ,rgbw), dot(D.rgb ,rgbw), dot(H.rgb ,rgbw), dot(F.rgb ,rgbw));\n"
-	"	float4 c  = float4(dot(C.rgb ,rgbw), dot(A.rgb ,rgbw), dot(G.rgb ,rgbw), dot(I.rgb ,rgbw));\n"
-	"	float4 d  = b.yzwx;\n"
-	"	float4 e  = dot(E.rgb,rgbw).xxxx;\n"
-	"	float4 f  = b.wxyz;\n"
-	"	float4 g  = c.zwxy;\n"
-	"	float4 h  = b.zwxy;\n"
-	"	float4 i  = c.wxyz;\n"
-	"	float4 i4 = float4(dot(I4.rgb,rgbw), dot(C1.rgb,rgbw), dot(A0.rgb,rgbw), dot(G5.rgb,rgbw));\n"
-	"	float4 i5 = float4(dot(I5.rgb,rgbw), dot(C4.rgb,rgbw), dot(A1.rgb,rgbw), dot(G0.rgb,rgbw));\n"
-	"	float4 h5 = float4(dot(H5.rgb,rgbw), dot(F4.rgb,rgbw), dot(B1.rgb,rgbw), dot(D0.rgb,rgbw));\n"
-	"	float4 f4 = h5.yzwx;\n"
-	"\n"
-	"	// These inequations define the line below which interpolation occurs.\n"
-	"	fx   = (Ao*fp.y+Bo*fp.x);\n"
-	"	fx_l = (Ax*fp.y+Bx*fp.x);\n"
-	"	fx_u = (Ay*fp.y+By*fp.x);\n"
-	"\n"
-	"	irlv0 = diff(e,f) * diff(e,h);\n"
-	"\n"
-//	"	irlv1 = irlv0;\n"
-//	"  irlv1 = (irlv0  * ( neq(f,b) * neq(h,d) + eq(e,i) * neq(f,i4) * neq(h,i5) + eq(e,g) + eq(e,c) ) );\n"
-//	"  float4 c1 = i4.yzwx;\n"
-//	"  float4 g0 = i5.wxyz;\n"
-//	"  irlv1 = (irlv0  * ( neq(f,b) * neq(h,d) + eq(e,i) * neq(f,i4) * neq(h,i5) + eq(e,g) + eq(e,c) ) * (diff(f,f4) * diff(f,i) + diff(h,h5) * diff(h,i) + diff(h,g) + diff(f,c) + eq(b,c1) * eq(d,g0)));\n"
-	"	irlv1 = (irlv0  * ( neq(f,b) * neq(f,c) + neq(h,d) * neq(h,g) + eq(e,i) * (neq(f,f4) * neq(f,i4) + neq(h,h5) * neq(h,i5)) + eq(e,g) + eq(e,c)) );\n"
-	"\n"
-	"	irlv2l = diff(e,g) * diff(d,g);\n"
-	"	irlv2u = diff(e,c) * diff(b,c);\n"
-	"\n"
-	"	float4 fx45i = clamp((fx   + delta   -Co - Ci)/(2.0*delta  ), 0.0, 1.0);\n"
-	"	float4 fx45  = clamp((fx   + delta   -Co     )/(2.0*delta  ), 0.0, 1.0);\n"
-	"	float4 fx30  = clamp((fx_l + delta_l -Cx     )/(2.0*delta_l), 0.0, 1.0);\n"
-	"	float4 fx60  = clamp((fx_u + delta_u -Cy     )/(2.0*delta_u), 0.0, 1.0);\n"
-	"\n"
-	"	float4 wd1 = wd( e, c,  g, i, h5, f4, h, f);\n"
-	"	float4 wd2 = wd( h, d, i5, f, i4,  b, e, i);\n"
-	"\n"
-	"	edri  = step(wd1, wd2) * irlv0;\n"
-	"	edr   = step(wd1 + float4(0.1, 0.1, 0.1, 0.1), wd2) * step(float4(0.5, 0.5, 0.5, 0.5), irlv1);\n"
-	"	edr_l = step( 2.*df(f,g), df(h,c) ) * irlv2l * edr;\n"
-	"	edr_u = step( 2.*df(h,c), df(f,g) ) * irlv2u * edr;\n"
-	"\n"
-	"	fx45  = edr   * fx45;\n"
-	"	fx30  = edr_l * fx30;\n"
-	"	fx60  = edr_u * fx60;\n"
-	"	fx45i = edri  * fx45i;\n"
-	"\n"
-	"	px = step(df(e,f), df(e,h));\n"
-	"\n"
-	"	float4 maximos = max(max(fx30, fx60), max(fx45, fx45i));\n"
-//	"  float4 maximos = max(max(fx30, fx60), fx45);"
-	"\n"
-	"	float4 res1 = E;\n"
-	"	res1 = lerp(res1, lerp(H, F, px.x), maximos.x);\n"
-	"	res1 = lerp(res1, lerp(B, D, px.z), maximos.z);\n"
-	"\n"
-	"	float4 res2 = E;\n"
-	"	res2 = lerp(res2, lerp(F, B, px.y), maximos.y);\n"
-	"	res2 = lerp(res2, lerp(D, H, px.w), maximos.w);\n"
-	"\n"
-	"	float4 res = lerp(res1, res2, step(c_df(E.rgb, res1.rgb), c_df(E.rgb, res2.rgb)));\n"
-	"  return postdivide_alpha(res);\n"
-	"}\n";
-static const char* sampler_sabr =
-	"float c_df(float4 c1, float4 c2) {\n"
-	"	float3 df = abs(c1.rgb - c2.rgb);\n"
-	"	return df.r + df.g + df.b;\n"
-	"}\n"
-	"static const  float4 Ai  = float4( 1.0, -1.0, -1.0,  1.0);\n"
-	"static const  float4 B45 = float4( 1.0,  1.0, -1.0, -1.0);\n"
-	"static const  float4 C45 = float4( 1.5,  0.5, -0.5,  0.5);\n"
-	"static const  float4 M45 = float4(0.4, 0.4, 0.4, 0.4);\n"
-	"static const  float4 M30 = float4(0.2, 0.4, 0.2, 0.4);\n"
-	"static const  float3 lum = float3(0.21, 0.72, 0.07);\n"
-	"\n"
-	"float lum_to(float4 v) {\n"
-	"  return dot(lum, v.rgb);\n"
-	"}\n"
-	"float4 lum_to(float4 v0, float4 v1, float4 v2, float4 v3) {\n"
-	"  return float4(lum_to(v0), lum_to(v1), lum_to(v2), lum_to(v3));\n"
-	"}\n"
-	"\n"
-	"\n"
-	"float4 tex_sample(float2 coord)\n"
-	"{\n"
-	"/*\n"
-	"	Mask for algorithm\n"
-	"	+-----+-----+-----+\n"
-	"	|     |  7  |     |\n"
-	"	+-----+-----+-----+\n"
-	"	| 11  | 12  | 13  |\n"
-	"	+-----+-----+-----+\n"
-	"	|     | 17  |     |\n"
-	"	+-----+-----+-----+\n"
-	"*/\n"
-	"// Store mask values\n"
-	"  float4 P07 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0, -1.0)));\n"
-	"  float4 P11 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2(-1.0,  0.0)));\n"
-	"  float4 P12 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0,  0.0)));\n"
-	"  float4 P13 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 1.0,  0.0)));\n"
-	"  float4 P17 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0,  1.0)));\n"
-	"\n"
-	"  // Store luminance values of each point\n"
-	"  float4 p7  = lum_to(P07, P11, P17, P13);\n"
-	"  float4 p12 = lum_to(P12);\n"
-	"  float4 p13 = p7.wxyz; // P13, P7,  P11, P17\n"
-	"  float4 p17 = p7.zwxy; // P11, P17, P13, P7\n"
-	"\n"
-	"  float2 fp = frac(coord * u_texSize.xy);\n"
-	"  float4 ma45 = smoothstep(C45 - M45, C45 + M45, Ai * fp.y + B45 * fp.x);\n"
-	"  float4 px = step(abs(p12 - p17), abs(p12 - p13));\n"
-	"\n"
-	"  float4 res1 = P12;\n"
-	"  res1 = lerp(res1, lerp(P13, P17, px.x), ma45.x);\n"
-	"  res1 = lerp(res1, lerp(P07, P13, px.y), ma45.y);\n"
-	"  res1 = lerp(res1, lerp(P11, P07, px.z), ma45.z);\n"
-	"  res1 = lerp(res1, lerp(P17, P11, px.w), ma45.w);\n"
-	"\n"
-	"  float4 res2 = P12;\n"
-	"  res2 = lerp(res2, lerp(P17, P11, px.w), ma45.w);\n"
-	"  res2 = lerp(res2, lerp(P11, P07, px.z), ma45.z);\n"
-	"  res2 = lerp(res2, lerp(P07, P13, px.y), ma45.y);\n"
-	"  res2 = lerp(res2, lerp(P13, P17, px.x), ma45.x);\n"
-	"\n"
-	"  float4 res = lerp(res1, res2, step(c_df(P12, res1), c_df(P12, res2)));\n"
-	"  return postdivide_alpha(res);\n"
-	"}\n";
-
-static const char* sampler_hybrid = sampler_xbrz;
-static const char* sampler_hybrid_bicubic = sampler_xbrz;
-static const char* sampler_bicubic =
-	"// generate the value of a Mitchell-Netravali scaling spline at distance d, with parameters A and B\n"
-	"// B=1 C=0   : cubic B spline (very smooth)\n"
-	"// B=C=1/3   : recommended for general upscaling\n"
-	"// B=0 C=1/2 : Catmull-Rom spline (sharp, ringing)\n"
-	"// see Mitchell & Netravali, \"Reconstruction Filters in Computer Graphics\"\n"
-	"\n"
-	"//#define BSPLINE\n"
-	"#ifdef BSPLINE\n"
-	"  static const float B = 1.0f;\n"
-	"  static const float C = 0.0f;\n"
-	"#else\n"
-	"  static const float B = 1.0f / 3.0f;\n"
-	"  static const float C = 1.0f / 3.0f;\n"
-	"#endif\n"
-	"float mitchell(float2 pos) {\n"
-	"  float x = sqrt(dot(pos, pos));\n"
-	"	return\n"
-	"    step(x, 2.0) * \n"
-	"    (step(1.0, x) * ((-B - 6 * C)*(x*x*x) + (6 * B + 30 * C)*(x*x) + (-12 * B - 48 * C)*x + (8 * B + 24 * C)) +\n"
-	"     step(x, 1.0) * ((12 - 9 * B - 6 * C)*(x*x*x) + (-18 + 12 * B + 6 * C)*(x*x) + (6 - 2 * B)))\n"
-	"    / 6.0f;\n"
-	"}\n"
-	"float4 tex_sample(float2 coord) {\n"
-	"  float2 offset = frac(coord * u_texSize.xy) - 0.5;\n"
-	"  float4 tempColor = 0.0;\n"
-	"  float4 c;\n"
-	"  float i,j;\n"
-	"  float2 pos;\n"
-	"  for (i = -2.0; i < 3.0; i++){\n"
-	"    pos.x = offset.x - i;\n"
-	"    for (j = -2.0; j < 3.0 ;j++){\n"
-	"      pos.y = offset.y - j;\n"
-	"      c=premultiply_alpha(tex_sample_direct(coord - pos * u_texSize.zw));\n"
-	"      tempColor+=c*mitchell(pos);\n"
-	"    }\n"
-	"  }\n"
-	"  return postdivide_alpha(tempColor);\n"
-	"};\n";
-
 // Missing: Z depth range
 // Also, logic ops etc, of course, as they are not supported in DX9.
-bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, ShaderLanguage lang) {
+bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, char* scalerCode, ShaderLanguage lang) {
 	char *p = buffer;
 
 	bool lmode = id.Bit(FS_BIT_LMODE);
@@ -396,7 +56,6 @@ bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, ShaderLanguag
 	GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3);
 	GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2);
 	bool needShaderTexClamp = id.Bit(FS_BIT_SHADER_TEX_CLAMP);
-	bool needScaleFilter = g_Config.bRealtimeTexScaling && g_Config.iTexScalingLevel != 1 && !gstate_c.curTextureIsRT;
 
 	ReplaceBlendType replaceBlend = static_cast<ReplaceBlendType>(id.Bits(FS_BIT_REPLACE_BLEND, 3));
 	ReplaceAlphaType stencilToAlpha = static_cast<ReplaceAlphaType>(id.Bits(FS_BIT_STENCIL_TO_ALPHA, 2));
@@ -495,7 +154,7 @@ bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, ShaderLanguag
 		if (doTextureAlpha) {
 			// TODO: check why the [0.0,1.0] clamp is necessary here
 			WRITE(p, "float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); }");
-			WRITE(p, "float4 postdivide_alpha(float4 c) { return c.a < 0.001f? 0.0f : float4(c.rgb / c.a, c.a); }\n");
+			WRITE(p, "float4 postdivide_alpha(float4 c) { return c.a < 0.0001f? 0.0f : float4(c.rgb / c.a, c.a); }\n");
 		} else {
 			WRITE(p, "#define premultiply_alpha(c) (c)\n");
 			WRITE(p, "#define postdivide_alpha(c) (c)\n");
@@ -509,35 +168,10 @@ bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, ShaderLanguag
 		}
 		WRITE(p, "};\n");
 
-		if (needScaleFilter) {
-			switch (g_Config.iTexScalingType) {
-			case TextureScalerCommon::XBRZ:
-				WRITE(p, sampler_xbrz);
-				break;
-			case TextureScalerCommon::HYBRID:
-				WRITE(p, sampler_hybrid);
-				break;
-			case TextureScalerCommon::BICUBIC:
-				WRITE(p, sampler_bicubic);
-				break;
-			case TextureScalerCommon::HYBRID_BICUBIC:
-				WRITE(p, sampler_hybrid_bicubic);
-				break;
-			case TextureScalerCommon::SABR:
-				WRITE(p, sampler_sabr);
-				break;
-			case TextureScalerCommon::GAUSSIAN:
-				WRITE(p, sampler_gaussian);
-				break;
-			case TextureScalerCommon::COSINE:
-				WRITE(p, sampler_cosine);
-				break;
-			default:
-				ERROR_LOG(G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
-				break;
-			}
+		if (scalerCode && !gstate_c.curTextureIsRT) {
+			WRITE(p, "%s", scalerCode);  // emit external scaler text verbatim; it must never act as the format string
 		} else {
-			WRITE(p, sampler_default);
+			WRITE(p, "#define tex_sample(x) tex_sample_direct(x)\n");  // no scaler: tex_sample is the direct fetch
 		}
 	}
 
diff --git a/GPU/Directx9/PixelShaderGeneratorDX9.h b/GPU/Directx9/PixelShaderGeneratorDX9.h
index f845f5dcf8cc..4779d9f10317 100644
--- a/GPU/Directx9/PixelShaderGeneratorDX9.h
+++ b/GPU/Directx9/PixelShaderGeneratorDX9.h
@@ -22,7 +22,7 @@
 
 namespace DX9 {
 
-bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, ShaderLanguage lang = HLSL_DX9);
+bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, char *scalerCode = nullptr, ShaderLanguage lang = HLSL_DX9);
 
 #define CONST_PS_TEXENV 0
 #define CONST_PS_ALPHACOLORREF 1
diff --git a/GPU/Directx9/ShaderManagerDX9.cpp b/GPU/Directx9/ShaderManagerDX9.cpp
index b0a99e93dc69..f7634dadf5c6 100644
--- a/GPU/Directx9/ShaderManagerDX9.cpp
+++ b/GPU/Directx9/ShaderManagerDX9.cpp
@@ -476,7 +476,7 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
 }
 
 ShaderManagerDX9::ShaderManagerDX9(LPDIRECT3DDEVICE9 device) : device_(device), lastVShader_(nullptr), lastPShader_(nullptr) {
-	codeBuffer_ = new char[16384];
+	codeBuffer_ = new char[16384 * 2];  // 32 KiB; NOTE(review): presumably must hold generated shader + scaler source, size is not checked here - confirm
 }
 
 ShaderManagerDX9::~ShaderManagerDX9() {
diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp
index bd7e37ff5eaf..5b297ff8a446 100644
--- a/UI/GameSettingsScreen.cpp
+++ b/UI/GameSettingsScreen.cpp
@@ -378,7 +378,7 @@ void GameSettingsScreen::CreateViews() {
 	});
 	texScalingChoice->SetDisabledPtr(&g_Config.bSoftwareRendering);
 
-	static const char *texScaleAlgos[] = { "xBRZ", "Hybrid", "Bicubic", "Hybrid + Bicubic", "SABR", "Gaussian", "Cosine"};
+	static const char *texScaleAlgos[] = { "xBRZ", "Hybrid", "Bicubic", "Hybrid + Bicubic", "XBR", "SABR", "Gaussian", "Cosine"};
 	PopupMultiChoice *texScalingType = graphicsSettings->Add(new PopupMultiChoice(&g_Config.iTexScalingType, gr->T("Upscale Type"), texScaleAlgos, 0, ARRAY_SIZE(texScaleAlgos), gr->GetName(), screenManager()));
 	texScalingType->SetDisabledPtr(&g_Config.bSoftwareRendering);
 
diff --git a/Windows/MainWindowMenu.cpp b/Windows/MainWindowMenu.cpp
index 8aba6ff3e26c..73a4ef571cae 100644
--- a/Windows/MainWindowMenu.cpp
+++ b/Windows/MainWindowMenu.cpp
@@ -787,6 +787,7 @@ namespace MainWindow {
 		case ID_TEXTURESCALING_HYBRID:          setTexScalingType(TextureScalerCommon::HYBRID); break;
 		case ID_TEXTURESCALING_BICUBIC:         setTexScalingType(TextureScalerCommon::BICUBIC); break;
 		case ID_TEXTURESCALING_HYBRID_BICUBIC:  setTexScalingType(TextureScalerCommon::HYBRID_BICUBIC); break;
+		case ID_TEXTURESCALING_XBR:             setTexScalingType(TextureScalerCommon::XBR); break;
 		case ID_TEXTURESCALING_SABR:            setTexScalingType(TextureScalerCommon::SABR); break;
 		case ID_TEXTURESCALING_GAUSSIAN:        setTexScalingType(TextureScalerCommon::GAUSSIAN); break;
 		case ID_TEXTURESCALING_COSINE:          setTexScalingType(TextureScalerCommon::COSINE); break;
@@ -1214,6 +1215,7 @@ namespace MainWindow {
 			ID_TEXTURESCALING_HYBRID,
 			ID_TEXTURESCALING_BICUBIC,
 			ID_TEXTURESCALING_HYBRID_BICUBIC,
+			ID_TEXTURESCALING_XBR,
 			ID_TEXTURESCALING_SABR,
 			ID_TEXTURESCALING_GAUSSIAN,
 			ID_TEXTURESCALING_COSINE,
diff --git a/Windows/ppsspp.rc b/Windows/ppsspp.rc
index 53bbe7e184d2..47a74e494e17 100644
--- a/Windows/ppsspp.rc
+++ b/Windows/ppsspp.rc
@@ -613,6 +613,7 @@ BEGIN
             MENUITEM "Hybrid",                          ID_TEXTURESCALING_HYBRID
             MENUITEM "Bicubic",                         ID_TEXTURESCALING_BICUBIC
             MENUITEM "Hybrid + Bicubic",                ID_TEXTURESCALING_HYBRID_BICUBIC
+            MENUITEM "XBR",                             ID_TEXTURESCALING_XBR
             MENUITEM "SABR",                            ID_TEXTURESCALING_SABR
             MENUITEM "Gaussian",                        ID_TEXTURESCALING_GAUSSIAN
             MENUITEM "Cosine",                          ID_TEXTURESCALING_COSINE
diff --git a/Windows/resource.h b/Windows/resource.h
index bdf1308100de..9a8c9f763d8e 100644
--- a/Windows/resource.h
+++ b/Windows/resource.h
@@ -342,9 +342,10 @@
 
 #define ID_TEXTURESCALING_REALTIME       40177
 #define ID_TEXTURESCALING_REALTIME_HC    40178
-#define ID_TEXTURESCALING_SABR           40179
-#define ID_TEXTURESCALING_GAUSSIAN       40180
-#define ID_TEXTURESCALING_COSINE         40181
+#define ID_TEXTURESCALING_XBR            40179
+#define ID_TEXTURESCALING_SABR           40180
+#define ID_TEXTURESCALING_GAUSSIAN       40181
+#define ID_TEXTURESCALING_COSINE         40182
 
 // Dummy option to let the buffered rendering hotkey cycle through all the options.
 #define ID_OPTIONS_BUFFEREDRENDERINGDUMMY 40500
diff --git a/assets/scalers/bicubic.hlsl b/assets/scalers/bicubic.hlsl
new file mode 100644
index 000000000000..8a20886e9d04
--- /dev/null
+++ b/assets/scalers/bicubic.hlsl
@@ -0,0 +1,49 @@
+
+// generate the value of a Mitchell-Netravali scaling spline at distance d, with parameters B and C
+// B=1 C=0   : cubic B spline (very smooth)
+// B=C=1/3   : recommended for general upscaling
+// B=0 C=1/2 : Catmull-Rom spline (sharp, ringing)
+// see Mitchell & Netravali, "Reconstruction Filters in Computer Graphics"
+
+//#define BSPLINE
+#ifdef BSPLINE
+  static const float B = 1.0f;
+  static const float C = 0.0f;
+#else
+  static const float B = 1.0f / 3.0f;
+  static const float C = 1.0f / 3.0f;
+#endif
+
+float mitchell_0_1(float x) {  // cubic segment that mitchell() uses when x < 1.0; coefficients depend on B and C
+	return ((12 - 9 * B - 6 * C)*(x*x*x) + (-18 + 12 * B + 6 * C)*(x*x) + (6 - 2 * B)) / 6.0f;
+}
+
+float mitchell_1_2(float x) {  // cubic segment that mitchell() uses when 1.0 <= x < 2.0; coefficients depend on B and C
+	return ((-B - 6 * C)*(x*x*x) + (6 * B + 30 * C)*(x*x) + (-12 * B - 48 * C)*x + (8 * B + 24 * C)) / 6.0f;
+}
+
+float mitchell(float2 pos) {
+  // Radial kernel: the Euclidean distance of the tap picks the spline segment.
+  float r = length(pos);
+  if (r < 1.0)
+    return mitchell_0_1(r);
+  if (r < 2.0)
+    return mitchell_1_2(r);
+  return 0.0;  // no contribution at distance 2.0 or beyond
+}
+float4 tex_sample(float2 coord) {
+  float2 subTexel = frac(coord * u_texSize.xy) - 0.5;  // fractional texel position, shifted by -0.5
+  float4 accum = 0.0;
+  float2 tap;
+  float tx, ty;
+  // Sum a 5x5 block of premultiplied taps, each weighted by the radial Mitchell kernel.
+  for (tx = -2.0; tx < 3.0; tx++) {
+    tap.x = subTexel.x - tx;
+    for (ty = -2.0; ty < 3.0; ty++) {
+      tap.y = subTexel.y - ty;
+      accum += premultiply_alpha(tex_sample_direct(coord - tap * u_texSize.zw)) * mitchell(tap);
+    }
+  }
+  // The taps were accumulated through premultiply_alpha; hand the sum to postdivide_alpha.
+  return postdivide_alpha(accum);
+}
diff --git a/assets/scalers/cosine.hlsl b/assets/scalers/cosine.hlsl
new file mode 100644
index 000000000000..2481529f940a
--- /dev/null
+++ b/assets/scalers/cosine.hlsl
@@ -0,0 +1,24 @@
+
+#define sharpness 1.0
+#define pi 3.14159265358
+#define a(x) abs(x)
+#define d(x,b) (pi*b*min(a(x)+0.5,1.0/b))
+#define e(x,b) (pi*b*min(max(a(x)-0.5,-1.0/b),1.0/b))
+#define KERNEL(x,b) ((d(x,b)+sin(d(x,b))-e(x,b)-sin(e(x,b)))/(2.0*pi))
+
+float4 tex_sample(float2 coord) {
+  float2 subTexel = frac(coord * u_texSize.xy) - 0.5;  // fractional texel position, shifted by -0.5
+  float4 accum = 0.0;
+  float2 tap;
+  float tx, ty;
+  // Sum a 4x4 block of premultiplied taps with separable KERNEL weights (x weight times y weight).
+  for (tx = -2.0; tx < 2.0; tx++) {
+    tap.x = subTexel.x - tx;
+    for (ty = -2.0; ty < 2.0; ty++) {
+      tap.y = subTexel.y - ty;
+      accum += premultiply_alpha(tex_sample_direct(coord - tap * u_texSize.zw)) * KERNEL(tap.x,sharpness) * KERNEL(tap.y,sharpness);
+    }
+  }
+  // The taps were accumulated through premultiply_alpha; hand the sum to postdivide_alpha.
+  return postdivide_alpha(accum);
+}
diff --git a/assets/scalers/gaussian.hlsl b/assets/scalers/gaussian.hlsl
new file mode 100644
index 000000000000..f57cd4ddcf58
--- /dev/null
+++ b/assets/scalers/gaussian.hlsl
@@ -0,0 +1,33 @@
+
+#define sharpness 1.0
+#define pi 3.14159265358
+#define normalGauss(x) ((exp(-(x)*(x)*0.5))/sqrt(2.0*pi))
+#define normalGauss2(x) (normalGauss(x - 0.5) - 0.5)
+float normalGaussIntegral(float x)  // odd in x: magnitude is computed from |x|, the sign is restored by sign(x)
+{
+	 float a1 = 0.4361836;  // a1..a3, p: polynomial constants; NOTE(review): look like Abramowitz & Stegun 26.2.16 - confirm
+	 float a2 = -0.1201676;
+	 float a3 = 0.9372980;
+	 float p = 0.3326700;
+	 float t = 1.0 / (1.0 + p*abs(x));  // substitution variable of the approximation, in (0, 1]
+
+	 return (0.5-normalGauss(x) * (t*(a1 + t*(a2 + a3*t))))*sign(x);  // sign(0) is 0, so the result is 0 at x == 0
+}
+#define KERNEL(x,b) (normalGaussIntegral(sqrt(2*pi)*b*(x - 0.5)) - normalGaussIntegral(sqrt(2*pi)*b*(x + 0.5)))
+
+float4 tex_sample(float2 coord) {
+  float2 subTexel = frac(coord * u_texSize.xy) - 0.5;  // fractional texel position, shifted by -0.5
+  float4 accum = 0.0;
+  float2 tap;
+  float tx, ty;
+  // Sum a 4x4 block of premultiplied taps with separable KERNEL weights (x weight times y weight).
+  for (tx = -2.0; tx < 2.0; tx++) {
+    tap.x = subTexel.x - tx;
+    for (ty = -2.0; ty < 2.0; ty++) {
+      tap.y = subTexel.y - ty;
+      accum += premultiply_alpha(tex_sample_direct(coord - tap * u_texSize.zw)) * KERNEL(tap.x,sharpness) * KERNEL(tap.y,sharpness);
+    }
+  }
+  // The taps were accumulated through premultiply_alpha; hand the sum to postdivide_alpha.
+  return postdivide_alpha(accum);
+}
diff --git a/assets/scalers/hybrid.hlsl b/assets/scalers/hybrid.hlsl
new file mode 100644
index 000000000000..f03a50302933
--- /dev/null
+++ b/assets/scalers/hybrid.hlsl
@@ -0,0 +1,4 @@
+
+float4 tex_sample(float2 coord) {
+  return tex_sample_direct(coord);  // pass-through: this scaler file applies no filtering of its own
+}
diff --git a/assets/scalers/hybrid_bicubic.hlsl b/assets/scalers/hybrid_bicubic.hlsl
new file mode 100644
index 000000000000..f03a50302933
--- /dev/null
+++ b/assets/scalers/hybrid_bicubic.hlsl
@@ -0,0 +1,4 @@
+
+float4 tex_sample(float2 coord) {
+  return tex_sample_direct(coord);  // pass-through: this scaler file applies no filtering of its own
+}
diff --git a/assets/scalers/sabr.hlsl b/assets/scalers/sabr.hlsl
new file mode 100644
index 000000000000..05d0222e694e
--- /dev/null
+++ b/assets/scalers/sabr.hlsl
@@ -0,0 +1,64 @@
+
+float c_df(float4 c1, float4 c2) {
+	float3 absDiff = abs(c1.rgb - c2.rgb);  // alpha takes no part in the colour distance
+	return absDiff.r + absDiff.g + absDiff.b;
+}
+static const  float4 Ai  = float4( 1.0, -1.0, -1.0,  1.0);
+static const  float4 B45 = float4( 1.0,  1.0, -1.0, -1.0);
+static const  float4 C45 = float4( 1.5,  0.5, -0.5,  0.5);
+static const  float4 M45 = float4(0.4, 0.4, 0.4, 0.4);
+static const  float4 M30 = float4(0.2, 0.4, 0.2, 0.4);
+static const  float3 lum = float3(0.21, 0.72, 0.07);
+
+float lum_to(float4 v) {
+  return dot(v.rgb, lum);  // weighted RGB sum using the lum constants; alpha is ignored
+}
+float4 lum_to(float4 v0, float4 v1, float4 v2, float4 v3) {
+  return float4(dot(v0.rgb, lum), dot(v1.rgb, lum), dot(v2.rgb, lum), dot(v3.rgb, lum));  // one luminance per input, in argument order
+}
+
+
+float4 tex_sample(float2 coord)
+{
+/*
+	Mask for algorithm (centre texel 12 plus its four direct neighbours)
+	+-----+-----+-----+
+	|     |  7  |     |
+	+-----+-----+-----+
+	| 11  | 12  | 13  |
+	+-----+-----+-----+
+	|     | 17  |     |
+	+-----+-----+-----+
+*/
+// Store mask values (each tap goes through premultiply_alpha)
+  float4 P07 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0, -1.0)));
+  float4 P11 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2(-1.0,  0.0)));
+  float4 P12 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0,  0.0)));
+  float4 P13 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 1.0,  0.0)));
+  float4 P17 = premultiply_alpha(tex_sample_direct(coord + u_texSize.zw * float2( 0.0,  1.0)));
+
+  // Store luminance values of each point
+  float4 p7  = lum_to(P07, P11, P17, P13);
+  float4 p12 = lum_to(P12); // scalar luminance replicated to all four components
+  float4 p13 = p7.wxyz; // P13, P7,  P11, P17
+  float4 p17 = p7.zwxy; // P17, P13, P7,  P11
+
+  float2 fp = frac(coord * u_texSize.xy); // position inside the current texel, each axis in [0, 1)
+  float4 ma45 = smoothstep(C45 - M45, C45 + M45, Ai * fp.y + B45 * fp.x); // one blend mask per diagonal direction
+  float4 px = step(abs(p12 - p17), abs(p12 - p13)); // 1.0 where |p12 - p13| >= |p12 - p17|, else 0.0
+
+  float4 res1 = P12; // candidate 1: apply the four directional blends in x, y, z, w order
+  res1 = lerp(res1, lerp(P13, P17, px.x), ma45.x);
+  res1 = lerp(res1, lerp(P07, P13, px.y), ma45.y);
+  res1 = lerp(res1, lerp(P11, P07, px.z), ma45.z);
+  res1 = lerp(res1, lerp(P17, P11, px.w), ma45.w);
+
+  float4 res2 = P12; // candidate 2: the same four blends applied in the reverse order
+  res2 = lerp(res2, lerp(P17, P11, px.w), ma45.w);
+  res2 = lerp(res2, lerp(P11, P07, px.z), ma45.z);
+  res2 = lerp(res2, lerp(P07, P13, px.y), ma45.y);
+  res2 = lerp(res2, lerp(P13, P17, px.x), ma45.x);
+
+  float4 res = lerp(res1, res2, step(c_df(P12, res1), c_df(P12, res2))); // res2 when it differs from P12 at least as much as res1
+  return postdivide_alpha(res);
+}
diff --git a/assets/scalers/xbr.hlsl b/assets/scalers/xbr.hlsl
new file mode 100644
index 000000000000..546831f36c1a
--- /dev/null
+++ b/assets/scalers/xbr.hlsl
@@ -0,0 +1,210 @@
+/*
+   Hyllian's xBR-lv2 Shader
+
+   Copyright (C) 2011-2015 Hyllian - sergiogdb@gmail.com
+
+   Permission is hereby granted, free of charge, to any person obtaining a
+   copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to permit
+   persons to whom the Software is furnished to do so, subject to the
+   following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+   THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+
+   Incorporates some of the ideas from SABR shader. Thanks to Joshua Street.
+*/
+
+#define XBR_Y_WEIGHT          48.0 //  0.0 .. 100.0
+#define XBR_EQ_THRESHOLD      15.0 //  0.0 ..  50.0
+#define XBR_LV1_COEFFICIENT    0.5 //  0.0 ..  30.0
+#define XBR_LV2_COEFFICIENT    2.0 //  1.0 ..   3.0
+//#define SMALL_DETAILS
+
+#define CORNER_TYPE 3 // 1 .. 4
+#define XBR_SCALE 4.0
+
+static const float coef           = 2.0;
+static const float3  rgbw         = float3(14.352, 28.176, 5.472);
+static const float4  eq_threshold = float4(15.0, 15.0, 15.0, 15.0);
+
+static const float4 delta   = float4(1.0/XBR_SCALE, 1.0/XBR_SCALE, 1.0/XBR_SCALE, 1.0/XBR_SCALE);
+static const float4 delta_l = float4(0.5/XBR_SCALE, 1.0/XBR_SCALE, 0.5/XBR_SCALE, 1.0/XBR_SCALE);
+static const float4 delta_u = delta_l.yxwz;
+
+static const float4 Ao = float4( 1.0, -1.0, -1.0, 1.0 );
+static const float4 Bo = float4( 1.0,  1.0, -1.0,-1.0 );
+static const float4 Co = float4( 1.5,  0.5, -0.5, 0.5 );
+static const float4 Ax = float4( 1.0, -1.0, -1.0, 1.0 );
+static const float4 Bx = float4( 0.5,  2.0, -0.5,-2.0 );
+static const float4 Cx = float4( 1.0,  1.0, -0.5, 0.0 );
+static const float4 Ay = float4( 1.0, -1.0, -1.0, 1.0 );
+static const float4 By = float4( 2.0,  0.5, -2.0,-0.5 );
+static const float4 Cy = float4( 2.0,  0.0, -1.0, 0.5 );
+static const float4 Ci = float4(0.25, 0.25, 0.25, 0.25);
+
+static const float3 Y = float3(0.2126, 0.7152, 0.0722);
+
+// Component-wise absolute difference |A - B|.
+float4 df(float4 A, float4 B) { return abs(A-B); }
+
+// Per component: 1.0 where A and B differ by at least 0.001, otherwise 0.0.
+float4 diff(float4 A, float4 B) { return step(0.001, df(A, B)); }
+
+// Per component: 1.0 where |A - B| <= XBR_EQ_THRESHOLD ("equal"), otherwise 0.0.
+float4 eq(float4 A, float4 B) { return (step(df(A, B), XBR_EQ_THRESHOLD)); }
+
+// Per component: 1.0 where |A - B| > XBR_EQ_THRESHOLD ("not equal"), otherwise 0.0; the complement of eq().
+float4 neq(float4 A, float4 B) { return (1.0 - eq(A, B)); }
+
+// Weighted distance: sum of four pairwise differences plus the (g,h) difference weighted 4x.
+float4 wd(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h) {
+  return (df(a,b) + df(a,c) + df(d,e) + df(d,f) + 4.0*df(g,h));
+}
+
+float4 weighted_distance(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h, float4 i, float4 j, float4 k, float4 l) {  // variant of wd() used when SMALL_DETAILS is defined
+	return (df(a,b) + df(a,c) + df(d,e) + df(d,f) + df(i,j) + df(k,l) + 2.0*df(g,h));  // two extra pair terms; the (g,h) term is weighted 2x instead of 4x
+}
+
+float c_df(float3 c1, float3 c2) {
+  float3 absDiff = abs(c1 - c2);  // named so the local does not hide the df() helper
+  return absDiff.r + absDiff.g + absDiff.b;
+}
+
+float4 tex_sample(float2 coord) {  // xBR filter: reads 21 taps around coord and blends the centre texel E with its neighbours
+  float dx = u_texSize.z;  // horizontal spacing between taps
+  float dy = u_texSize.w;  // vertical spacing between taps
+
+  float4 t1 = coord.xxxy + float4( -dx, 0, dx,-2.0*dy); // A1 B1 C1
+  float4 t2 = coord.xxxy + float4( -dx, 0, dx,    -dy); //  A  B  C
+  float4 t3 = coord.xxxy + float4( -dx, 0, dx,      0); //  D  E  F
+  float4 t4 = coord.xxxy + float4( -dx, 0, dx,     dy); //  G  H  I
+  float4 t5 = coord.xxxy + float4( -dx, 0, dx, 2.0*dy); // G5 H5 I5
+  float4 t6 = coord.xyyy + float4(-2.0*dx,-dy, 0,  dy); // A0 D0 G0
+  float4 t7 = coord.xyyy + float4( 2.0*dx,-dy, 0,  dy); // C4 F4 I4
+  float4 edri, edr, edr_l, edr_u, px; // px = pixel, edr = edge detection rule
+  float4 irlv0, irlv1, irlv2l, irlv2u, block_3d;
+  float4 fx, fx_l, fx_u; // inequations of straight lines.
+
+  float2 fp  = frac(coord*u_texSize.xy);
+
+  float4 A1 = premultiply_alpha(tex_sample_direct(t1.xw));
+  float4 B1 = premultiply_alpha(tex_sample_direct(t1.yw));
+  float4 C1 = premultiply_alpha(tex_sample_direct(t1.zw));
+  float4 A  = premultiply_alpha(tex_sample_direct(t2.xw));
+  float4 B  = premultiply_alpha(tex_sample_direct(t2.yw));
+  float4 C  = premultiply_alpha(tex_sample_direct(t2.zw));
+  float4 D  = premultiply_alpha(tex_sample_direct(t3.xw));
+  float4 E  = premultiply_alpha(tex_sample_direct(t3.yw));
+  float4 F  = premultiply_alpha(tex_sample_direct(t3.zw));
+  float4 G  = premultiply_alpha(tex_sample_direct(t4.xw));
+  float4 H  = premultiply_alpha(tex_sample_direct(t4.yw));
+  float4 I  = premultiply_alpha(tex_sample_direct(t4.zw));
+  float4 G5 = premultiply_alpha(tex_sample_direct(t5.xw));
+  float4 H5 = premultiply_alpha(tex_sample_direct(t5.yw));
+  float4 I5 = premultiply_alpha(tex_sample_direct(t5.zw));
+  float4 A0 = premultiply_alpha(tex_sample_direct(t6.xy));
+  float4 D0 = premultiply_alpha(tex_sample_direct(t6.xz));
+  float4 G0 = premultiply_alpha(tex_sample_direct(t6.xw));
+  float4 C4 = premultiply_alpha(tex_sample_direct(t7.xy));
+  float4 F4 = premultiply_alpha(tex_sample_direct(t7.xz));
+  float4 I4 = premultiply_alpha(tex_sample_direct(t7.xw));
+
+  float4 b  = float4(dot(B.rgb ,rgbw), dot(D.rgb ,rgbw), dot(H.rgb ,rgbw), dot(F.rgb ,rgbw));  // rgbw-weighted RGB of the four edge neighbours
+  float4 c  = float4(dot(C.rgb ,rgbw), dot(A.rgb ,rgbw), dot(G.rgb ,rgbw), dot(I.rgb ,rgbw));  // rgbw-weighted RGB of the four corner neighbours
+  float4 d  = b.yzwx;
+  float4 e  = dot(E.rgb,rgbw);  // scalar for the centre texel, replicated to all four components
+  float4 f  = b.wxyz;
+  float4 g  = c.zwxy;
+  float4 h  = b.zwxy;
+  float4 i  = c.wxyz;
+
+  float4 i4, i5, h5, f4;
+
+  float y_weight = XBR_Y_WEIGHT;  // only read by the SMALL_DETAILS path below
+#ifdef SMALL_DETAILS
+  i4 = mul(float4x3(I4.rgb, C1.rgb, A0.rgb, G5.rgb), y_weight * Y);
+  i5 = mul(float4x3(I5.rgb, C4.rgb, A1.rgb, G0.rgb), y_weight * Y);
+  h5 = mul(float4x3(H5.rgb, F4.rgb, B1.rgb, D0.rgb), y_weight * Y);
+#else
+  i4 = float4(dot(I4.rgb,rgbw), dot(C1.rgb,rgbw), dot(A0.rgb,rgbw), dot(G5.rgb,rgbw));
+  i5 = float4(dot(I5.rgb,rgbw), dot(C4.rgb,rgbw), dot(A1.rgb,rgbw), dot(G0.rgb,rgbw));
+  h5 = float4(dot(H5.rgb,rgbw), dot(F4.rgb,rgbw), dot(B1.rgb,rgbw), dot(D0.rgb,rgbw));
+#endif
+  f4 = h5.yzwx;
+
+  // These inequations define the line below which interpolation occurs.
+  fx   = (Ao*fp.y+Bo*fp.x);
+  fx_l = (Ax*fp.y+Bx*fp.x);
+  fx_u = (Ay*fp.y+By*fp.x);
+
+  irlv1 = irlv0 = diff(e,f) * diff(e,h);  // CORNER_TYPE 1 keeps this value; types 2-4 overwrite irlv1 below
+
+#if CORNER_TYPE == 1
+#define SMOOTH_TIPS
+#elif CORNER_TYPE == 2
+  irlv1      = (irlv0 * ( neq(f,b) * neq(h,d) + eq(e,i) * neq(f,i4) * neq(h,i5) + eq(e,g) + eq(e,c) ) );
+#elif CORNER_TYPE == 3
+  irlv1     = (irlv0  * ( neq(f,b) * neq(f,c) + neq(h,d) * neq(h,g) + eq(e,i) * (neq(f,f4) * neq(f,i4) + neq(h,h5) * neq(h,i5)) + eq(e,g) + eq(e,c)) );
+#else // CORNER_TYPE == 4
+  float4 c1 = i4.yzwx;
+  float4 g0 = i5.wxyz;
+  irlv1     = (irlv0  *  ( neq(f,b) * neq(h,d) + eq(e,i) * neq(f,i4) * neq(h,i5) + eq(e,g) + eq(e,c) ) * (diff(f,f4) * diff(f,i) + diff(h,h5) * diff(h,i) + diff(h,g) + diff(f,c) + eq(b,c1) * eq(d,g0)));
+#endif
+
+  irlv2l = diff(e,g) * diff(d,g);
+  irlv2u = diff(e,c) * diff(b,c);
+
+  float4 fx45i = clamp((fx   + delta   -Co - Ci)/(2.0*delta  ), 0.0, 1.0);
+  float4 fx45  = clamp((fx   + delta   -Co     )/(2.0*delta  ), 0.0, 1.0);
+  float4 fx30  = clamp((fx_l + delta_l -Cx     )/(2.0*delta_l), 0.0, 1.0);
+  float4 fx60  = clamp((fx_u + delta_u -Cy     )/(2.0*delta_u), 0.0, 1.0);
+
+  float4 wd1, wd2;
+#ifdef SMALL_DETAILS
+  wd1 = weighted_distance( e, c, g, i, f4, h5, h, f, b, d, i4, i5);
+  wd2 = weighted_distance( h, d, i5, f, b, i4, e, i, g, h5, c, f4);
+#else
+  wd1 = wd( e, c,  g, i, h5, f4, h, f);
+  wd2 = wd( h, d, i5, f, i4,  b, e, i);
+#endif
+
+  edri  = step(wd1, wd2) * irlv0;
+  edr   = step(wd1 + float4(0.1, 0.1, 0.1, 0.1), wd2) * step(float4(0.5, 0.5, 0.5, 0.5), irlv1);
+  edr_l = step( XBR_LV2_COEFFICIENT*df(f,g), df(h,c) ) * irlv2l * edr;
+  edr_u = step( XBR_LV2_COEFFICIENT*df(h,c), df(f,g) ) * irlv2u * edr;
+
+  fx45  = edr   * fx45;   // each blend factor is zeroed where its edge rule did not fire
+  fx30  = edr_l * fx30;
+  fx60  = edr_u * fx60;
+  fx45i = edri  * fx45i;
+
+  px = step(df(e,f), df(e,h));  // 1.0 where df(e,h) >= df(e,f), else 0.0
+
+#ifdef SMOOTH_TIPS
+  float4 maximos = max(max(fx30, fx60), max(fx45, fx45i));
+#else
+  float4 maximos = max(max(fx30, fx60), fx45);
+#endif
+
+  float4 res1 = E;  // candidate 1: blends driven by components x and z
+  res1 = lerp(res1, lerp(H, F, px.x), maximos.x);
+  res1 = lerp(res1, lerp(B, D, px.z), maximos.z);
+
+  float4 res2 = E;  // candidate 2: blends driven by components y and w
+  res2 = lerp(res2, lerp(F, B, px.y), maximos.y);
+  res2 = lerp(res2, lerp(D, H, px.w), maximos.w);
+
+  float4 res = lerp(res1, res2, step(c_df(E.rgb, res1.rgb), c_df(E.rgb, res2.rgb)));  // res2 when it differs from E at least as much as res1
+  return postdivide_alpha(res);
+}
diff --git a/assets/scalers/xbrz.hlsl b/assets/scalers/xbrz.hlsl
new file mode 100644
index 000000000000..43745eb453ca
--- /dev/null
+++ b/assets/scalers/xbrz.hlsl
@@ -0,0 +1,335 @@
+
+// 4xBRZ shader - Copyright (C) 2014-2016 DeSmuME team
+//
+// This file is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// This file is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with the this software.  If not, see <http://www.gnu.org/licenses/>.
+
+
+/*
+   Hyllian's xBR-vertex code and texel mapping
+
+   Copyright (C) 2011/2016 Hyllian - sergiogdb@gmail.com
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#define BLEND_NONE 0
+#define BLEND_NORMAL 1
+#define BLEND_DOMINANT 2
+#define LUMINANCE_WEIGHT 1.0
+#define EQUAL_COLOR_TOLERANCE 30.0/255.0
+#define STEEP_DIRECTION_THRESHOLD 2.2
+#define DOMINANT_DIRECTION_THRESHOLD 3.6
+
+float reduce(float4 color)
+{
+  return dot(float3(65536.0, 256.0, 1.0), color.rgb);  // folds RGB into one scalar key (alpha ignored) for the ==/!= texel tests
+}
+
+float DistYCbCr(float4 pixA, float4 pixB)  // colour distance between two texels; alpha is not considered
+{
+  const float3 w = float3(0.2627, 0.6780, 0.0593);  // luma weights; NOTE(review): look like the Rec. 2020 coefficients - confirm
+  const float scaleB = 0.5 / (1.0 - w.b);
+  const float scaleR = 0.5 / (1.0 - w.r);
+  float3 diff = pixA.rgb - pixB.rgb;
+  float Y = dot(diff, w);  // luma of the RGB difference
+  float Cb = scaleB * (diff.b - Y);  // blue-difference chroma of the RGB difference
+  float Cr = scaleR * (diff.r - Y);  // red-difference chroma of the RGB difference
+
+  return sqrt( ((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr) );  // Euclidean length of (weighted Y, Cb, Cr)
+}
+
+bool IsPixEqual(const float4 pixA, const float4 pixB)  // true when the two texels are within the colour tolerance
+{
+  return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE);  // macro expands to 30.0/255.0 unparenthesized; safe here because '/' binds tighter than '<'
+}
+
+bool IsBlendingNeeded(const int4 blend)  // true when at least one of the four entries is not BLEND_NONE
+{
+  return any(blend - BLEND_NONE);  // BLEND_NONE is 0, so any non-zero component means a blend is requested
+}
+
+//---------------------------------------
+// Input Pixel Mapping:    --|21|22|23|--
+//                         19|06|07|08|09
+//                         18|05|00|01|10
+//                         17|04|03|02|11
+//                         --|15|14|13|--
+//
+// Output Pixel Mapping: 20|21|22|23|24|25
+//                       19|06|07|08|09|26
+//                       18|05|00|01|10|27
+//                       17|04|03|02|11|28
+//                       16|15|14|13|12|29
+//                       35|34|33|32|31|30
+
+float4 tex_sample(float2 coord)
+{
+  float dx = u_texSize.z;
+  float dy = u_texSize.w;
+
+    //  A1 B1 C1
+  // A0 A  B  C C4
+  // D0 D  E  F F4
+  // G0 G  H  I I4
+    //  G5 H5 I5
+
+  float4 t1 = coord.xxxy + float4( -dx, 0.0, dx,-2.0*dy); // A1 B1 C1
+  float4 t2 = coord.xxxy + float4( -dx, 0.0, dx, -dy); // A B C
+  float4 t3 = coord.xxxy + float4( -dx, 0.0, dx, 0.0); // D E F
+  float4 t4 = coord.xxxy + float4( -dx, 0.0, dx, dy); // G H I
+  float4 t5 = coord.xxxy + float4( -dx, 0.0, dx, 2.0*dy); // G5 H5 I5
+  float4 t6 = coord.xyyy + float4(-2.0*dx,-dy, 0.0, dy); // A0 D0 G0
+  float4 t7 = coord.xyyy + float4( 2.0*dx,-dy, 0.0, dy); // C4 F4 I4
+
+  float2 f = frac(coord.xy * u_texSize.xy);
+
+  //---------------------------------------
+  // Input Pixel Mapping:    |21|22|23|
+  //                       19|06|07|08|09
+  //                       18|05|00|01|10
+  //                       17|04|03|02|11
+  //                         |15|14|13|
+
+  float4 src[25];
+
+  src[21] = premultiply_alpha(tex_sample_direct(t1.xw));
+  src[22] = premultiply_alpha(tex_sample_direct(t1.yw));
+  src[23] = premultiply_alpha(tex_sample_direct(t1.zw));
+  src[ 6] = premultiply_alpha(tex_sample_direct(t2.xw));
+  src[ 7] = premultiply_alpha(tex_sample_direct(t2.yw));
+  src[ 8] = premultiply_alpha(tex_sample_direct(t2.zw));
+  src[ 5] = premultiply_alpha(tex_sample_direct(t3.xw));
+  src[ 0] = premultiply_alpha(tex_sample_direct(t3.yw));
+  src[ 1] = premultiply_alpha(tex_sample_direct(t3.zw));
+  src[ 4] = premultiply_alpha(tex_sample_direct(t4.xw));
+  src[ 3] = premultiply_alpha(tex_sample_direct(t4.yw));
+  src[ 2] = premultiply_alpha(tex_sample_direct(t4.zw));
+  src[15] = premultiply_alpha(tex_sample_direct(t5.xw));
+  src[14] = premultiply_alpha(tex_sample_direct(t5.yw));
+  src[13] = premultiply_alpha(tex_sample_direct(t5.zw));
+  src[19] = premultiply_alpha(tex_sample_direct(t6.xy));
+  src[18] = premultiply_alpha(tex_sample_direct(t6.xz));
+  src[17] = premultiply_alpha(tex_sample_direct(t6.xw));
+  src[ 9] = premultiply_alpha(tex_sample_direct(t7.xy));
+  src[10] = premultiply_alpha(tex_sample_direct(t7.xz));
+  src[11] = premultiply_alpha(tex_sample_direct(t7.xw));
+
+  float v[9];
+  v[0] = reduce(src[0]);
+  v[1] = reduce(src[1]);
+  v[2] = reduce(src[2]);
+  v[3] = reduce(src[3]);
+  v[4] = reduce(src[4]);
+  v[5] = reduce(src[5]);
+  v[6] = reduce(src[6]);
+  v[7] = reduce(src[7]);
+  v[8] = reduce(src[8]);
+
+  int4 blendResult = BLEND_NONE;
+
+  // Preprocess corners
+  // Pixel Tap Mapping: --|--|--|--|--
+  //                    --|--|07|08|--
+  //                    --|05|00|01|10
+  //                    --|04|03|02|11
+  //                    --|--|14|13|--
+  // Corner (1, 1)
+  if ( ((v[0] == v[1] && v[3] == v[2]) || (v[0] == v[3] && v[1] == v[2])) == false)
+  {
+    float dist_03_01 = DistYCbCr(src[ 4], src[ 0]) + DistYCbCr(src[ 0], src[ 8]) + DistYCbCr(src[14], src[ 2]) + DistYCbCr(src[ 2], src[10]) + (4.0 * DistYCbCr(src[ 3], src[ 1]));
+    float dist_00_02 = DistYCbCr(src[ 5], src[ 3]) + DistYCbCr(src[ 3], src[13]) + DistYCbCr(src[ 7], src[ 1]) + DistYCbCr(src[ 1], src[11]) + (4.0 * DistYCbCr(src[ 0], src[ 2]));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_03_01) < dist_00_02;
+    blendResult[2] = ((dist_03_01 < dist_00_02) && (v[0] != v[1]) && (v[0] != v[3])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
+
+  // Pixel Tap Mapping: --|--|--|--|--
+  //                    --|06|07|--|--
+  //                    18|05|00|01|--
+  //                    17|04|03|02|--
+  //                    --|15|14|--|--
+  // Corner (0, 1)
+  if ( ((v[5] == v[0] && v[4] == v[3]) || (v[5] == v[4] && v[0] == v[3])) == false)
+  {
+    float dist_04_00 = DistYCbCr(src[17], src[ 5]) + DistYCbCr(src[ 5], src[ 7]) + DistYCbCr(src[15], src[ 3]) + DistYCbCr(src[ 3], src[ 1]) + (4.0 * DistYCbCr(src[ 4], src[ 0]));
+    float dist_05_03 = DistYCbCr(src[18], src[ 4]) + DistYCbCr(src[ 4], src[14]) + DistYCbCr(src[ 6], src[ 0]) + DistYCbCr(src[ 0], src[ 2]) + (4.0 * DistYCbCr(src[ 5], src[ 3]));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_05_03) < dist_04_00;
+    blendResult[3] = ((dist_04_00 > dist_05_03) && (v[0] != v[5]) && (v[0] != v[3])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
+
+  // Pixel Tap Mapping: --|--|22|23|--
+  //                    --|06|07|08|09
+  //                    --|05|00|01|10
+  //                    --|--|03|02|--
+  //                    --|--|--|--|--
+  // Corner (1, 0)
+  if ( ((v[7] == v[8] && v[0] == v[1]) || (v[7] == v[0] && v[8] == v[1])) == false)
+  {
+    float dist_00_08 = DistYCbCr(src[ 5], src[ 7]) + DistYCbCr(src[ 7], src[23]) + DistYCbCr(src[ 3], src[ 1]) + DistYCbCr(src[ 1], src[ 9]) + (4.0 * DistYCbCr(src[ 0], src[ 8]));
+    float dist_07_01 = DistYCbCr(src[ 6], src[ 0]) + DistYCbCr(src[ 0], src[ 2]) + DistYCbCr(src[22], src[ 8]) + DistYCbCr(src[ 8], src[10]) + (4.0 * DistYCbCr(src[ 7], src[ 1]));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_07_01) < dist_00_08;
+    blendResult[1] = ((dist_00_08 > dist_07_01) && (v[0] != v[7]) && (v[0] != v[1])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
+
+  // Pixel Tap Mapping: --|21|22|--|--
+  //                    19|06|07|08|--
+  //                    18|05|00|01|--
+  //                    --|04|03|--|--
+  //                    --|--|--|--|--
+  // Corner (0, 0)
+  if ( ((v[6] == v[7] && v[5] == v[0]) || (v[6] == v[5] && v[7] == v[0])) == false)
+  {
+    float dist_05_07 = DistYCbCr(src[18], src[ 6]) + DistYCbCr(src[ 6], src[22]) + DistYCbCr(src[ 4], src[ 0]) + DistYCbCr(src[ 0], src[ 8]) + (4.0 * DistYCbCr(src[ 5], src[ 7]));
+    float dist_06_00 = DistYCbCr(src[19], src[ 5]) + DistYCbCr(src[ 5], src[ 3]) + DistYCbCr(src[21], src[ 7]) + DistYCbCr(src[ 7], src[ 1]) + (4.0 * DistYCbCr(src[ 6], src[ 0]));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_05_07) < dist_06_00;
+    blendResult[0] = ((dist_05_07 < dist_06_00) && (v[0] != v[5]) && (v[0] != v[7])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
+
+  float4 dst[16];
+  dst[ 0] = src[0];
+  dst[ 1] = src[0];
+  dst[ 2] = src[0];
+  dst[ 3] = src[0];
+  dst[ 4] = src[0];
+  dst[ 5] = src[0];
+  dst[ 6] = src[0];
+  dst[ 7] = src[0];
+  dst[ 8] = src[0];
+  dst[ 9] = src[0];
+  dst[10] = src[0];
+  dst[11] = src[0];
+  dst[12] = src[0];
+  dst[13] = src[0];
+  dst[14] = src[0];
+  dst[15] = src[0];
+
+  // Scale pixel
+  if (IsBlendingNeeded(blendResult) == true)
+  {
+    float dist_01_04 = DistYCbCr(src[1], src[4]);
+    float dist_03_08 = DistYCbCr(src[3], src[8]);
+    bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[4]) && (v[5] != v[4]);
+    bool haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[8]) && (v[7] != v[8]);
+    bool needBlend = (blendResult[2] != BLEND_NONE);
+    bool doLineBlend = (  blendResult[2] >= BLEND_DOMINANT ||
+               ((blendResult[1] != BLEND_NONE && !IsPixEqual(src[0], src[4])) ||
+               (blendResult[3] != BLEND_NONE && !IsPixEqual(src[0], src[8])) ||
+               (IsPixEqual(src[4], src[3]) && IsPixEqual(src[3], src[2]) && IsPixEqual(src[2], src[1]) && IsPixEqual(src[1], src[8]) && IsPixEqual(src[0], src[2]) == false) ) == false );
+
+    float4 blendPix = ( DistYCbCr(src[0], src[1]) <= DistYCbCr(src[0], src[3]) ) ? src[1] : src[3];
+    dst[ 2] = lerp(dst[ 2], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
+    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
+    dst[10] = lerp(dst[10], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
+    dst[11] = lerp(dst[11], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[12] = lerp(dst[12], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
+    dst[13] = lerp(dst[13], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[14] = lerp(dst[14], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
+    dst[15] = lerp(dst[15], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+
+    dist_01_04 = DistYCbCr(src[7], src[2]);
+    dist_03_08 = DistYCbCr(src[1], src[6]);
+    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[2]) && (v[3] != v[2]);
+    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[6]) && (v[5] != v[6]);
+    needBlend = (blendResult[1] != BLEND_NONE);
+    doLineBlend = (  blendResult[1] >= BLEND_DOMINANT ||
+            !((blendResult[0] != BLEND_NONE && !IsPixEqual(src[0], src[2])) ||
+            (blendResult[2] != BLEND_NONE && !IsPixEqual(src[0], src[6])) ||
+            (IsPixEqual(src[2], src[1]) && IsPixEqual(src[1], src[8]) && IsPixEqual(src[8], src[7]) && IsPixEqual(src[7], src[6]) && !IsPixEqual(src[0], src[8])) ) );
+
+    blendPix = ( DistYCbCr(src[0], src[7]) <= DistYCbCr(src[0], src[1]) ) ? src[7] : src[1];
+    dst[ 1] = lerp(dst[ 1], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
+    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
+    dst[ 7] = lerp(dst[ 7], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
+    dst[ 8] = lerp(dst[ 8], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
+    dst[10] = lerp(dst[10], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[11] = lerp(dst[11], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
+    dst[12] = lerp(dst[12], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+
+    dist_01_04 = DistYCbCr(src[5], src[8]);
+    dist_03_08 = DistYCbCr(src[7], src[4]);
+    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[8]) && (v[1] != v[8]);
+    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[4]) && (v[3] != v[4]);
+    needBlend = (blendResult[0] != BLEND_NONE);
+    doLineBlend = (  blendResult[0] >= BLEND_DOMINANT ||
+            !((blendResult[3] != BLEND_NONE && !IsPixEqual(src[0], src[8])) ||
+            (blendResult[1] != BLEND_NONE && !IsPixEqual(src[0], src[4])) ||
+            (IsPixEqual(src[8], src[7]) && IsPixEqual(src[7], src[6]) && IsPixEqual(src[6], src[5]) && IsPixEqual(src[5], src[4]) && !IsPixEqual(src[0], src[6])) ) );
+
+    blendPix = ( DistYCbCr(src[0], src[5]) <= DistYCbCr(src[0], src[7]) ) ? src[5] : src[7];
+    dst[ 0] = lerp(dst[ 0], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
+    dst[15] = lerp(dst[15], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
+    dst[ 4] = lerp(dst[ 4], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
+    dst[ 5] = lerp(dst[ 5], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
+    dst[ 7] = lerp(dst[ 7], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[ 8] = lerp(dst[ 8], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
+    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+
+
+    dist_01_04 = DistYCbCr(src[3], src[6]);
+    dist_03_08 = DistYCbCr(src[5], src[2]);
+    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[6]) && (v[7] != v[6]);
+    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[2]) && (v[1] != v[2]);
+    needBlend = (blendResult[3] != BLEND_NONE);
+    doLineBlend = (  blendResult[3] >= BLEND_DOMINANT ||
+            !((blendResult[2] != BLEND_NONE && !IsPixEqual(src[0], src[6])) ||
+            (blendResult[0] != BLEND_NONE && !IsPixEqual(src[0], src[2])) ||
+            (IsPixEqual(src[6], src[5]) && IsPixEqual(src[5], src[4]) && IsPixEqual(src[4], src[3]) && IsPixEqual(src[3], src[2]) && !IsPixEqual(src[0], src[4])) ) );
+
+    blendPix = ( DistYCbCr(src[0], src[3]) <= DistYCbCr(src[0], src[5]) ) ? src[3] : src[5];
+    dst[ 3] = lerp(dst[ 3], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
+    dst[12] = lerp(dst[12], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
+    dst[13] = lerp(dst[13], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
+    dst[14] = lerp(dst[14], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[15] = lerp(dst[15], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
+    dst[ 4] = lerp(dst[ 4], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[ 5] = lerp(dst[ 5], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
+    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+  }
+
+  // select output pixel
+  float4 res = lerp(lerp(lerp(lerp(dst[ 6], dst[ 7], step(0.25, f.x)),
+                              lerp(dst[ 8], dst[ 9], step(0.75, f.x)),
+                              step(0.50, f.x)),
+                         lerp(lerp(dst[ 5], dst[ 0], step(0.25, f.x)),
+                              lerp(dst[ 1], dst[10], step(0.75, f.x)),
+                              step(0.50, f.x)),
+                         step(0.25, f.y)),
+                    lerp(lerp(lerp(dst[ 4], dst[ 3], step(0.25, f.x)),
+                              lerp(dst[ 2], dst[11], step(0.75, f.x)),
+                              step(0.50, f.x)),
+                         lerp(lerp(dst[15], dst[14], step(0.25, f.x)),
+                              lerp(dst[13], dst[12], step(0.75, f.x)),
+                              step(0.50, f.x)),
+                         step(0.75, f.y)),
+                    step(0.50, f.y));
+
+  return postdivide_alpha(res);
+};
diff --git a/libretro/libretro.cpp b/libretro/libretro.cpp
index 7a5bb611b64f..5332f805da64 100644
--- a/libretro/libretro.cpp
+++ b/libretro/libretro.cpp
@@ -175,7 +175,7 @@ static RetroOption<int> ppsspp_button_preference("ppsspp_button_preference", "Co
 static RetroOption<bool> ppsspp_fast_memory("ppsspp_fast_memory", "Fast Memory (Speedhack)", true);
 static RetroOption<bool> ppsspp_block_transfer_gpu("ppsspp_block_transfer_gpu", "Block Transfer GPU", true);
 static RetroOption<int> ppsspp_texture_scaling_level("ppsspp_texture_scaling_level", "Texture Scaling Level", { { "1", 1 }, { "2", 2 }, { "3", 3 }, { "4", 4 }, { "5", 5 }, { "0", 0 } });
-static RetroOption<int> ppsspp_texture_scaling_type("ppsspp_texture_scaling_type", "Texture Scaling Type", { { "xbrz", TextureScalerCommon::XBRZ }, { "hybrid", TextureScalerCommon::HYBRID }, { "bicubic", TextureScalerCommon::BICUBIC }, { "hybrid_bicubic", TextureScalerCommon::HYBRID_BICUBIC }, { "SABR", TextureScalerCommon::SABR }, { "gaussian", TextureScalerCommon::GAUSSIAN }, { "cosine", TextureScalerCommon::COSINE } });
+static RetroOption<int> ppsspp_texture_scaling_type("ppsspp_texture_scaling_type", "Texture Scaling Type", { { "xbrz", TextureScalerCommon::XBRZ }, { "hybrid", TextureScalerCommon::HYBRID }, { "bicubic", TextureScalerCommon::BICUBIC }, { "hybrid_bicubic", TextureScalerCommon::HYBRID_BICUBIC }, { "XBR", TextureScalerCommon::XBR }, { "SABR", TextureScalerCommon::SABR }, { "gaussian", TextureScalerCommon::GAUSSIAN }, { "cosine", TextureScalerCommon::COSINE } });
 static RetroOption<bool> ppsspp_texture_scaling_realtime("ppsspp_texture_scaling_realtime", "Realtime Texture Scaling", false);
 static RetroOption<int> ppsspp_texture_anisotropic_filtering("ppsspp_texture_anisotropic_filtering", "Anisotropic Filtering", { "off", "1x", "2x", "4x", "8x", "16x" });
 static RetroOption<bool> ppsspp_texture_deposterize("ppsspp_texture_deposterize", "Texture Deposterize", false);

From e74e7ad990cc2a0e519a884c931340a6a6ecccd0 Mon Sep 17 00:00:00 2001
From: aliaspider <aliaspider@gmail.com>
Date: Wed, 4 Apr 2018 07:48:31 +0100
Subject: [PATCH 3/3] D3D: gpu scalers: - rework xbrz: improve performance and
 make it scale independent, move the old xbrz code to 4xbrz.hlsl. - better
 handling of alpha in xbr/xbrz shaders, fixes text scaling with some games.

---
 GPU/Common/TextureScalerCommon.cpp       |   1 +
 GPU/Common/TextureScalerCommon.h         |   2 +-
 GPU/D3D11/ShaderManagerD3D11.cpp         |   1 +
 GPU/Directx9/PixelShaderGeneratorDX9.cpp |   7 +-
 UI/GameSettingsScreen.cpp                |   2 +-
 Windows/MainWindowMenu.cpp               |   4 +
 Windows/ppsspp.rc                        |   3 +-
 Windows/resource.h                       |   9 +-
 assets/scalers/4xbrz.hlsl                | 343 +++++++++++++++++
 assets/scalers/bicubic.hlsl              |   8 +
 assets/scalers/cosine.hlsl               |   7 +
 assets/scalers/gaussian.hlsl             |   7 +
 assets/scalers/sabr.hlsl                 |   7 +
 assets/scalers/xbr.hlsl                  |  94 ++---
 assets/scalers/xbrz.hlsl                 | 459 +++++++++++------------
 libretro/libretro.cpp                    |   2 +-
 16 files changed, 652 insertions(+), 304 deletions(-)
 create mode 100644 assets/scalers/4xbrz.hlsl

diff --git a/GPU/Common/TextureScalerCommon.cpp b/GPU/Common/TextureScalerCommon.cpp
index d01dd3e605a2..9ea91f1cce8d 100644
--- a/GPU/Common/TextureScalerCommon.cpp
+++ b/GPU/Common/TextureScalerCommon.cpp
@@ -558,6 +558,7 @@ bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, u32 &dstFmt, int &
 	case XBR:
 	case SABR:
 		// no cpu implementation, fall back to xbrz
+	case _4XBRZ:
 	case XBRZ:
 		ScaleXBRZ(factor, inputBuf, outputBuf, width, height);
 		break;
diff --git a/GPU/Common/TextureScalerCommon.h b/GPU/Common/TextureScalerCommon.h
index 8711f77b7cba..56470c2c42ed 100644
--- a/GPU/Common/TextureScalerCommon.h
+++ b/GPU/Common/TextureScalerCommon.h
@@ -31,7 +31,7 @@ class TextureScalerCommon {
 	bool Scale(u32 *&data, u32 &dstfmt, int &width, int &height, int factor);
 	bool ScaleInto(u32 *out, u32 *src, u32 &dstfmt, int &width, int &height, int factor);
 
-	enum { XBRZ = 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3, XBR = 4, SABR = 5, GAUSSIAN = 6, COSINE = 7 };
+	enum { XBRZ = 0, HYBRID = 1, BICUBIC = 2, HYBRID_BICUBIC = 3, _4XBRZ = 4, XBR = 5, SABR = 6, GAUSSIAN = 7, COSINE = 8 };
 
 protected:
 	virtual void ConvertTo8888(u32 format, u32 *source, u32 *&dest, int width, int height) = 0;
diff --git a/GPU/D3D11/ShaderManagerD3D11.cpp b/GPU/D3D11/ShaderManagerD3D11.cpp
index 2ce967bf2065..6cfa216de0e2 100644
--- a/GPU/D3D11/ShaderManagerD3D11.cpp
+++ b/GPU/D3D11/ShaderManagerD3D11.cpp
@@ -219,6 +219,7 @@ void ShaderManagerD3D11::GetShaders(int prim, u32 vertType, D3D11VertexShader **
 				"scalers/hybrid.hlsl",
 				"scalers/bicubic.hlsl",
 				"scalers/hybrid_bicubic.hlsl",
+				"scalers/4xbrz.hlsl",
 				"scalers/xbr.hlsl",
 				"scalers/sabr.hlsl",
 				"scalers/gaussian.hlsl",
diff --git a/GPU/Directx9/PixelShaderGeneratorDX9.cpp b/GPU/Directx9/PixelShaderGeneratorDX9.cpp
index 6207ab22d850..093a2fd1317b 100644
--- a/GPU/Directx9/PixelShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/PixelShaderGeneratorDX9.cpp
@@ -152,12 +152,7 @@ bool GenerateFragmentShaderHLSL(const FShaderID &id, char *buffer, char* scalerC
 
 	if (!isModeClear && doTexture) {
 		if (doTextureAlpha) {
-			// TODO: check why the [0.0,1.0] clamp is necessary here
-			WRITE(p, "float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); }");
-			WRITE(p, "float4 postdivide_alpha(float4 c) { return c.a < 0.0001f? 0.0f : float4(c.rgb / c.a, c.a); }\n");
-		} else {
-			WRITE(p, "#define premultiply_alpha(c) (c)\n");
-			WRITE(p, "#define postdivide_alpha(c) (c)\n");
+			WRITE(p, "#define BLEND_ALPHA\n");
 		}
 
 		WRITE(p, "float4 tex_sample_direct(float2 coord) {\n");
diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp
index 5b297ff8a446..3b5d0fdc5644 100644
--- a/UI/GameSettingsScreen.cpp
+++ b/UI/GameSettingsScreen.cpp
@@ -378,7 +378,7 @@ void GameSettingsScreen::CreateViews() {
 	});
 	texScalingChoice->SetDisabledPtr(&g_Config.bSoftwareRendering);
 
-	static const char *texScaleAlgos[] = { "xBRZ", "Hybrid", "Bicubic", "Hybrid + Bicubic", "XBR", "SABR", "Gaussian", "Cosine"};
+	static const char *texScaleAlgos[] = { "xBRZ", "Hybrid", "Bicubic", "Hybrid + Bicubic", "4xBRZ", "xBR", "SABR", "Gaussian", "Cosine"};
 	PopupMultiChoice *texScalingType = graphicsSettings->Add(new PopupMultiChoice(&g_Config.iTexScalingType, gr->T("Upscale Type"), texScaleAlgos, 0, ARRAY_SIZE(texScaleAlgos), gr->GetName(), screenManager()));
 	texScalingType->SetDisabledPtr(&g_Config.bSoftwareRendering);
 
diff --git a/Windows/MainWindowMenu.cpp b/Windows/MainWindowMenu.cpp
index 73a4ef571cae..d46688cfcb25 100644
--- a/Windows/MainWindowMenu.cpp
+++ b/Windows/MainWindowMenu.cpp
@@ -346,6 +346,8 @@ namespace MainWindow {
 		TranslateMenuItem(menu, ID_TEXTURESCALING_HYBRID);
 		TranslateMenuItem(menu, ID_TEXTURESCALING_BICUBIC);
 		TranslateMenuItem(menu, ID_TEXTURESCALING_HYBRID_BICUBIC);
+		TranslateMenuItem(menu, ID_TEXTURESCALING_4XBRZ);
+		TranslateMenuItem(menu, ID_TEXTURESCALING_XBR);
 		TranslateMenuItem(menu, ID_TEXTURESCALING_SABR);
 		TranslateMenuItem(menu, ID_TEXTURESCALING_GAUSSIAN);
 		TranslateMenuItem(menu, ID_TEXTURESCALING_COSINE);
@@ -787,6 +789,7 @@ namespace MainWindow {
 		case ID_TEXTURESCALING_HYBRID:          setTexScalingType(TextureScalerCommon::HYBRID); break;
 		case ID_TEXTURESCALING_BICUBIC:         setTexScalingType(TextureScalerCommon::BICUBIC); break;
 		case ID_TEXTURESCALING_HYBRID_BICUBIC:  setTexScalingType(TextureScalerCommon::HYBRID_BICUBIC); break;
+		case ID_TEXTURESCALING_4XBRZ:           setTexScalingType(TextureScalerCommon::_4XBRZ); break;
 		case ID_TEXTURESCALING_XBR:             setTexScalingType(TextureScalerCommon::XBR); break;
 		case ID_TEXTURESCALING_SABR:            setTexScalingType(TextureScalerCommon::SABR); break;
 		case ID_TEXTURESCALING_GAUSSIAN:        setTexScalingType(TextureScalerCommon::GAUSSIAN); break;
@@ -1215,6 +1218,7 @@ namespace MainWindow {
 			ID_TEXTURESCALING_HYBRID,
 			ID_TEXTURESCALING_BICUBIC,
 			ID_TEXTURESCALING_HYBRID_BICUBIC,
+			ID_TEXTURESCALING_4XBRZ,
 			ID_TEXTURESCALING_XBR,
 			ID_TEXTURESCALING_SABR,
 			ID_TEXTURESCALING_GAUSSIAN,
diff --git a/Windows/ppsspp.rc b/Windows/ppsspp.rc
index 47a74e494e17..81006823d43d 100644
--- a/Windows/ppsspp.rc
+++ b/Windows/ppsspp.rc
@@ -613,7 +613,8 @@ BEGIN
             MENUITEM "Hybrid",                          ID_TEXTURESCALING_HYBRID
             MENUITEM "Bicubic",                         ID_TEXTURESCALING_BICUBIC
             MENUITEM "Hybrid + Bicubic",                ID_TEXTURESCALING_HYBRID_BICUBIC
-            MENUITEM "XBR",                             ID_TEXTURESCALING_XBR
+            MENUITEM "4xBRZ",                           ID_TEXTURESCALING_4XBRZ
+            MENUITEM "xBR",                             ID_TEXTURESCALING_XBR
             MENUITEM "SABR",                            ID_TEXTURESCALING_SABR
             MENUITEM "Gaussian",                        ID_TEXTURESCALING_GAUSSIAN
             MENUITEM "Cosine",                          ID_TEXTURESCALING_COSINE
diff --git a/Windows/resource.h b/Windows/resource.h
index 9a8c9f763d8e..ba0fbbfc4dcb 100644
--- a/Windows/resource.h
+++ b/Windows/resource.h
@@ -342,10 +342,11 @@
 
 #define ID_TEXTURESCALING_REALTIME       40177
 #define ID_TEXTURESCALING_REALTIME_HC    40178
-#define ID_TEXTURESCALING_XBR            40179
-#define ID_TEXTURESCALING_SABR           40180
-#define ID_TEXTURESCALING_GAUSSIAN       40181
-#define ID_TEXTURESCALING_COSINE         40182
+#define ID_TEXTURESCALING_4XBRZ          40179
+#define ID_TEXTURESCALING_XBR            40180
+#define ID_TEXTURESCALING_SABR           40181
+#define ID_TEXTURESCALING_GAUSSIAN       40182
+#define ID_TEXTURESCALING_COSINE         40183
 
 // Dummy option to let the buffered rendering hotkey cycle through all the options.
 #define ID_OPTIONS_BUFFEREDRENDERINGDUMMY 40500
diff --git a/assets/scalers/4xbrz.hlsl b/assets/scalers/4xbrz.hlsl
new file mode 100644
index 000000000000..6c54b05d53d1
--- /dev/null
+++ b/assets/scalers/4xbrz.hlsl
@@ -0,0 +1,343 @@
+
+// 4xBRZ shader - Copyright (C) 2014-2016 DeSmuME team
+//
+// This file is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// This file is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this software.  If not, see <http://www.gnu.org/licenses/>.
+
+
+/*
+   Hyllian's xBR-vertex code and texel mapping
+
+   Copyright (C) 2011/2016 Hyllian - sergiogdb@gmail.com
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#ifdef BLEND_ALPHA
+float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); } // clamp alpha to [0,1], then scale rgb by it
+float4 postdivide_alpha(float4 c) { return c.a < 0.001f? float4(0.0f,0.0f,0.0f,0.0f) : float4(c.rgb / c.a, c.a); } // alpha below 0.001 returns all zeros instead of dividing
+#else // BLEND_ALPHA not defined: both helpers pass the color through unchanged
+#define premultiply_alpha(c) (c)
+#define postdivide_alpha(c) (c)
+#endif // BLEND_ALPHA
+
+#define BLEND_NONE 0 // blendResult value: no blending for this corner
+#define BLEND_NORMAL 1 // blendResult value: corner is blended
+#define BLEND_DOMINANT 2 // blendResult value: blended, and one diagonal distance exceeds the other by DOMINANT_DIRECTION_THRESHOLD
+#define LUMINANCE_WEIGHT 1.0 // factor applied to the luma difference in DistYCbCr
+#define EQUAL_COLOR_TOLERANCE 30.0/255.0 // IsPixEqual threshold; NOTE(review): not parenthesized, only safe where operator precedence allows
+#define STEEP_DIRECTION_THRESHOLD 2.2 // ratio used by the haveShallowLine / haveSteepLine tests
+#define DOMINANT_DIRECTION_THRESHOLD 3.6 // ratio used by the dominantGradient tests
+
+float reduce(float4 color) // collapse rgb into one float key used for the v[] equality tests; alpha is not part of the key
+{
+  return dot(color.rgb, float3(65536.0, 256.0, 1.0)); // r*65536 + g*256 + b
+}
+
+float DistYCbCr(float4 pixA, float4 pixB) // distance between two pixels from luma, chroma and alpha differences
+{
+  const float3 w = float3(0.2627, 0.6780, 0.0593); // luma weights for r, g, b - NOTE(review): look like the BT.2020 coefficients, confirm
+  const float scaleB = 0.5 / (1.0 - w.b); // chroma scale factors derived from the luma weights
+  const float scaleR = 0.5 / (1.0 - w.r);
+  float4 diff = pixA - pixB; // per-channel difference, alpha included
+  float Y = dot(diff.rgb, w); // luma difference
+  float Cb = scaleB * (diff.b - Y); // blue-difference chroma
+  float Cr = scaleR * (diff.r - Y); // red-difference chroma
+
+  return sqrt( ((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr) + (diff.a * diff.a)); // Euclidean length of (weighted Y, Cb, Cr, alpha difference)
+}
+
+bool IsPixEqual(const float4 pixA, const float4 pixB) // fuzzy pixel equality based on DistYCbCr
+{
+  return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE); // equal when the distance is strictly below the tolerance
+}
+
+bool IsBlendingNeeded(const int4 blend) // blend holds one blend classification per corner
+{
+  return any(blend - BLEND_NONE); // true when at least one component differs from BLEND_NONE
+}
+
+//---------------------------------------
+// Input Pixel Mapping:    --|21|22|23|--
+//                         19|06|07|08|09
+//                         18|05|00|01|10
+//                         17|04|03|02|11
+//                         --|15|14|13|--
+//
+// Output Pixel Mapping: 20|21|22|23|24|25
+//                       19|06|07|08|09|26
+//                       18|05|00|01|10|27
+//                       17|04|03|02|11|28
+//                       16|15|14|13|12|29
+//                       35|34|33|32|31|30
+
+float4 tex_sample(float2 coord)
+{
+  float dx = u_texSize.z;
+  float dy = u_texSize.w;
+
+    //  A1 B1 C1
+  // A0 A  B  C C4
+  // D0 D  E  F F4
+  // G0 G  H  I I4
+    //  G5 H5 I5
+
+  float4 t1 = coord.xxxy + float4( -dx, 0.0, dx,-2.0*dy); // A1 B1 C1
+  float4 t2 = coord.xxxy + float4( -dx, 0.0, dx, -dy); // A B C
+  float4 t3 = coord.xxxy + float4( -dx, 0.0, dx, 0.0); // D E F
+  float4 t4 = coord.xxxy + float4( -dx, 0.0, dx, dy); // G H I
+  float4 t5 = coord.xxxy + float4( -dx, 0.0, dx, 2.0*dy); // G5 H5 I5
+  float4 t6 = coord.xyyy + float4(-2.0*dx,-dy, 0.0, dy); // A0 D0 G0
+  float4 t7 = coord.xyyy + float4( 2.0*dx,-dy, 0.0, dy); // C4 F4 I4
+
+  float2 f = frac(coord.xy * u_texSize.xy);
+
+  //---------------------------------------
+  // Input Pixel Mapping:    |21|22|23|
+  //                       19|06|07|08|09
+  //                       18|05|00|01|10
+  //                       17|04|03|02|11
+  //                         |15|14|13|
+
+  float4 src[25];
+
+  src[21] = premultiply_alpha(tex_sample_direct(t1.xw));
+  src[22] = premultiply_alpha(tex_sample_direct(t1.yw));
+  src[23] = premultiply_alpha(tex_sample_direct(t1.zw));
+  src[ 6] = premultiply_alpha(tex_sample_direct(t2.xw));
+  src[ 7] = premultiply_alpha(tex_sample_direct(t2.yw));
+  src[ 8] = premultiply_alpha(tex_sample_direct(t2.zw));
+  src[ 5] = premultiply_alpha(tex_sample_direct(t3.xw));
+  src[ 0] = premultiply_alpha(tex_sample_direct(t3.yw));
+  src[ 1] = premultiply_alpha(tex_sample_direct(t3.zw));
+  src[ 4] = premultiply_alpha(tex_sample_direct(t4.xw));
+  src[ 3] = premultiply_alpha(tex_sample_direct(t4.yw));
+  src[ 2] = premultiply_alpha(tex_sample_direct(t4.zw));
+  src[15] = premultiply_alpha(tex_sample_direct(t5.xw));
+  src[14] = premultiply_alpha(tex_sample_direct(t5.yw));
+  src[13] = premultiply_alpha(tex_sample_direct(t5.zw));
+  src[19] = premultiply_alpha(tex_sample_direct(t6.xy));
+  src[18] = premultiply_alpha(tex_sample_direct(t6.xz));
+  src[17] = premultiply_alpha(tex_sample_direct(t6.xw));
+  src[ 9] = premultiply_alpha(tex_sample_direct(t7.xy));
+  src[10] = premultiply_alpha(tex_sample_direct(t7.xz));
+  src[11] = premultiply_alpha(tex_sample_direct(t7.xw));
+
+  float v[9];
+  v[0] = reduce(src[0]);
+  v[1] = reduce(src[1]);
+  v[2] = reduce(src[2]);
+  v[3] = reduce(src[3]);
+  v[4] = reduce(src[4]);
+  v[5] = reduce(src[5]);
+  v[6] = reduce(src[6]);
+  v[7] = reduce(src[7]);
+  v[8] = reduce(src[8]);
+
+  int4 blendResult = BLEND_NONE;
+
+  // Preprocess corners
+  // Pixel Tap Mapping: --|--|--|--|--
+  //                    --|--|07|08|--
+  //                    --|05|00|01|10
+  //                    --|04|03|02|11
+  //                    --|--|14|13|--
+  // Corner (1, 1)
+  if ( ((v[0] == v[1] && v[3] == v[2]) || (v[0] == v[3] && v[1] == v[2])) == false)
+  {
+    float dist_03_01 = DistYCbCr(src[ 4], src[ 0]) + DistYCbCr(src[ 0], src[ 8]) + DistYCbCr(src[14], src[ 2]) + DistYCbCr(src[ 2], src[10]) + (4.0 * DistYCbCr(src[ 3], src[ 1]));
+    float dist_00_02 = DistYCbCr(src[ 5], src[ 3]) + DistYCbCr(src[ 3], src[13]) + DistYCbCr(src[ 7], src[ 1]) + DistYCbCr(src[ 1], src[11]) + (4.0 * DistYCbCr(src[ 0], src[ 2]));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_03_01) < dist_00_02;
+    blendResult[2] = ((dist_03_01 < dist_00_02) && (v[0] != v[1]) && (v[0] != v[3])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
+
+  // Pixel Tap Mapping: --|--|--|--|--
+  //                    --|06|07|--|--
+  //                    18|05|00|01|--
+  //                    17|04|03|02|--
+  //                    --|15|14|--|--
+  // Corner (0, 1)
+  if ( ((v[5] == v[0] && v[4] == v[3]) || (v[5] == v[4] && v[0] == v[3])) == false)
+  {
+    float dist_04_00 = DistYCbCr(src[17], src[ 5]) + DistYCbCr(src[ 5], src[ 7]) + DistYCbCr(src[15], src[ 3]) + DistYCbCr(src[ 3], src[ 1]) + (4.0 * DistYCbCr(src[ 4], src[ 0]));
+    float dist_05_03 = DistYCbCr(src[18], src[ 4]) + DistYCbCr(src[ 4], src[14]) + DistYCbCr(src[ 6], src[ 0]) + DistYCbCr(src[ 0], src[ 2]) + (4.0 * DistYCbCr(src[ 5], src[ 3]));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_05_03) < dist_04_00;
+    blendResult[3] = ((dist_04_00 > dist_05_03) && (v[0] != v[5]) && (v[0] != v[3])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
+
+  // Pixel Tap Mapping: --|--|22|23|--
+  //                    --|06|07|08|09
+  //                    --|05|00|01|10
+  //                    --|--|03|02|--
+  //                    --|--|--|--|--
+  // Corner (1, 0)
+  if ( ((v[7] == v[8] && v[0] == v[1]) || (v[7] == v[0] && v[8] == v[1])) == false)
+  {
+    float dist_00_08 = DistYCbCr(src[ 5], src[ 7]) + DistYCbCr(src[ 7], src[23]) + DistYCbCr(src[ 3], src[ 1]) + DistYCbCr(src[ 1], src[ 9]) + (4.0 * DistYCbCr(src[ 0], src[ 8]));
+    float dist_07_01 = DistYCbCr(src[ 6], src[ 0]) + DistYCbCr(src[ 0], src[ 2]) + DistYCbCr(src[22], src[ 8]) + DistYCbCr(src[ 8], src[10]) + (4.0 * DistYCbCr(src[ 7], src[ 1]));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_07_01) < dist_00_08;
+    blendResult[1] = ((dist_00_08 > dist_07_01) && (v[0] != v[7]) && (v[0] != v[1])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
+
+  // Pixel Tap Mapping: --|21|22|--|--
+  //                    19|06|07|08|--
+  //                    18|05|00|01|--
+  //                    --|04|03|--|--
+  //                    --|--|--|--|--
+  // Corner (0, 0)
+  if ( ((v[6] == v[7] && v[5] == v[0]) || (v[6] == v[5] && v[7] == v[0])) == false)
+  {
+    float dist_05_07 = DistYCbCr(src[18], src[ 6]) + DistYCbCr(src[ 6], src[22]) + DistYCbCr(src[ 4], src[ 0]) + DistYCbCr(src[ 0], src[ 8]) + (4.0 * DistYCbCr(src[ 5], src[ 7]));
+    float dist_06_00 = DistYCbCr(src[19], src[ 5]) + DistYCbCr(src[ 5], src[ 3]) + DistYCbCr(src[21], src[ 7]) + DistYCbCr(src[ 7], src[ 1]) + (4.0 * DistYCbCr(src[ 6], src[ 0]));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_05_07) < dist_06_00;
+    blendResult[0] = ((dist_05_07 < dist_06_00) && (v[0] != v[5]) && (v[0] != v[7])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
+
+  float4 dst[16];
+  dst[ 0] = src[0];
+  dst[ 1] = src[0];
+  dst[ 2] = src[0];
+  dst[ 3] = src[0];
+  dst[ 4] = src[0];
+  dst[ 5] = src[0];
+  dst[ 6] = src[0];
+  dst[ 7] = src[0];
+  dst[ 8] = src[0];
+  dst[ 9] = src[0];
+  dst[10] = src[0];
+  dst[11] = src[0];
+  dst[12] = src[0];
+  dst[13] = src[0];
+  dst[14] = src[0];
+  dst[15] = src[0];
+
+  // Scale pixel
+  if (IsBlendingNeeded(blendResult) == true)
+  {
+    float dist_01_04 = DistYCbCr(src[1], src[4]);
+    float dist_03_08 = DistYCbCr(src[3], src[8]);
+    bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[4]) && (v[5] != v[4]);
+    bool haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[8]) && (v[7] != v[8]);
+    bool needBlend = (blendResult[2] != BLEND_NONE);
+    bool doLineBlend = (  blendResult[2] >= BLEND_DOMINANT ||
+               ((blendResult[1] != BLEND_NONE && !IsPixEqual(src[0], src[4])) ||
+               (blendResult[3] != BLEND_NONE && !IsPixEqual(src[0], src[8])) ||
+               (IsPixEqual(src[4], src[3]) && IsPixEqual(src[3], src[2]) && IsPixEqual(src[2], src[1]) && IsPixEqual(src[1], src[8]) && IsPixEqual(src[0], src[2]) == false) ) == false );
+
+    float4 blendPix = ( DistYCbCr(src[0], src[1]) <= DistYCbCr(src[0], src[3]) ) ? src[1] : src[3];
+    dst[ 2] = lerp(dst[ 2], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
+    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
+    dst[10] = lerp(dst[10], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
+    dst[11] = lerp(dst[11], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[12] = lerp(dst[12], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
+    dst[13] = lerp(dst[13], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[14] = lerp(dst[14], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
+    dst[15] = lerp(dst[15], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+
+    dist_01_04 = DistYCbCr(src[7], src[2]);
+    dist_03_08 = DistYCbCr(src[1], src[6]);
+    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[2]) && (v[3] != v[2]);
+    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[6]) && (v[5] != v[6]);
+    needBlend = (blendResult[1] != BLEND_NONE);
+    doLineBlend = (  blendResult[1] >= BLEND_DOMINANT ||
+            !((blendResult[0] != BLEND_NONE && !IsPixEqual(src[0], src[2])) ||
+            (blendResult[2] != BLEND_NONE && !IsPixEqual(src[0], src[6])) ||
+            (IsPixEqual(src[2], src[1]) && IsPixEqual(src[1], src[8]) && IsPixEqual(src[8], src[7]) && IsPixEqual(src[7], src[6]) && !IsPixEqual(src[0], src[8])) ) );
+
+    blendPix = ( DistYCbCr(src[0], src[7]) <= DistYCbCr(src[0], src[1]) ) ? src[7] : src[1];
+    dst[ 1] = lerp(dst[ 1], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
+    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
+    dst[ 7] = lerp(dst[ 7], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
+    dst[ 8] = lerp(dst[ 8], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
+    dst[10] = lerp(dst[10], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[11] = lerp(dst[11], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
+    dst[12] = lerp(dst[12], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+
+    dist_01_04 = DistYCbCr(src[5], src[8]);
+    dist_03_08 = DistYCbCr(src[7], src[4]);
+    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[8]) && (v[1] != v[8]);
+    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[4]) && (v[3] != v[4]);
+    needBlend = (blendResult[0] != BLEND_NONE);
+    doLineBlend = (  blendResult[0] >= BLEND_DOMINANT ||
+            !((blendResult[3] != BLEND_NONE && !IsPixEqual(src[0], src[8])) ||
+            (blendResult[1] != BLEND_NONE && !IsPixEqual(src[0], src[4])) ||
+            (IsPixEqual(src[8], src[7]) && IsPixEqual(src[7], src[6]) && IsPixEqual(src[6], src[5]) && IsPixEqual(src[5], src[4]) && !IsPixEqual(src[0], src[6])) ) );
+
+    blendPix = ( DistYCbCr(src[0], src[5]) <= DistYCbCr(src[0], src[7]) ) ? src[5] : src[7];
+    dst[ 0] = lerp(dst[ 0], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
+    dst[15] = lerp(dst[15], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
+    dst[ 4] = lerp(dst[ 4], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
+    dst[ 5] = lerp(dst[ 5], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
+    dst[ 7] = lerp(dst[ 7], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[ 8] = lerp(dst[ 8], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
+    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+
+
+    dist_01_04 = DistYCbCr(src[3], src[6]);
+    dist_03_08 = DistYCbCr(src[5], src[2]);
+    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[6]) && (v[7] != v[6]);
+    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[2]) && (v[1] != v[2]);
+    needBlend = (blendResult[3] != BLEND_NONE);
+    doLineBlend = (  blendResult[3] >= BLEND_DOMINANT ||
+            !((blendResult[2] != BLEND_NONE && !IsPixEqual(src[0], src[6])) ||
+            (blendResult[0] != BLEND_NONE && !IsPixEqual(src[0], src[2])) ||
+            (IsPixEqual(src[6], src[5]) && IsPixEqual(src[5], src[4]) && IsPixEqual(src[4], src[3]) && IsPixEqual(src[3], src[2]) && !IsPixEqual(src[0], src[4])) ) );
+
+    blendPix = ( DistYCbCr(src[0], src[3]) <= DistYCbCr(src[0], src[5]) ) ? src[3] : src[5];
+    dst[ 3] = lerp(dst[ 3], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
+    dst[12] = lerp(dst[12], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
+    dst[13] = lerp(dst[13], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
+    dst[14] = lerp(dst[14], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[15] = lerp(dst[15], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
+    dst[ 4] = lerp(dst[ 4], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
+    dst[ 5] = lerp(dst[ 5], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
+    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+  }
+
+  // select output pixel
+  float4 res = lerp(lerp(lerp(lerp(dst[ 6], dst[ 7], step(0.25, f.x)),
+                              lerp(dst[ 8], dst[ 9], step(0.75, f.x)),
+                              step(0.50, f.x)),
+                         lerp(lerp(dst[ 5], dst[ 0], step(0.25, f.x)),
+                              lerp(dst[ 1], dst[10], step(0.75, f.x)),
+                              step(0.50, f.x)),
+                         step(0.25, f.y)),
+                    lerp(lerp(lerp(dst[ 4], dst[ 3], step(0.25, f.x)),
+                              lerp(dst[ 2], dst[11], step(0.75, f.x)),
+                              step(0.50, f.x)),
+                         lerp(lerp(dst[15], dst[14], step(0.25, f.x)),
+                              lerp(dst[13], dst[12], step(0.75, f.x)),
+                              step(0.50, f.x)),
+                         step(0.75, f.y)),
+                    step(0.50, f.y));
+
+  return postdivide_alpha(res);
+};
diff --git a/assets/scalers/bicubic.hlsl b/assets/scalers/bicubic.hlsl
index 8a20886e9d04..4848632e1048 100644
--- a/assets/scalers/bicubic.hlsl
+++ b/assets/scalers/bicubic.hlsl
@@ -1,4 +1,12 @@
 
+#ifdef BLEND_ALPHA
+float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); } // RGB scaled by alpha; alpha is clamped to [0,1] first
+float4 postdivide_alpha(float4 c) { return c.a < 0.001f? float4(0.0f,0.0f,0.0f,0.0f) : float4(c.rgb / c.a, c.a); } // divides RGB by alpha; alpha below 0.001 returns all zeros instead of dividing
+#else
+#define premultiply_alpha(c) (c)
+#define postdivide_alpha(c) (c)
+#endif
+
 // generate the value of a Mitchell-Netravali scaling spline at distance d, with parameters A and B
 // B=1 C=0   : cubic B spline (very smooth)
 // B=C=1/3   : recommended for general upscaling
diff --git a/assets/scalers/cosine.hlsl b/assets/scalers/cosine.hlsl
index 2481529f940a..a3fd5dd1cfff 100644
--- a/assets/scalers/cosine.hlsl
+++ b/assets/scalers/cosine.hlsl
@@ -1,3 +1,10 @@
+#ifdef BLEND_ALPHA
+float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); } // RGB scaled by alpha; alpha is clamped to [0,1] first
+float4 postdivide_alpha(float4 c) { return c.a < 0.001f? float4(0.0f,0.0f,0.0f,0.0f) : float4(c.rgb / c.a, c.a); } // divides RGB by alpha; alpha below 0.001 returns all zeros instead of dividing
+#else
+#define premultiply_alpha(c) (c)
+#define postdivide_alpha(c) (c)
+#endif
 
 #define sharpness 1.0
 #define pi 3.14159265358
diff --git a/assets/scalers/gaussian.hlsl b/assets/scalers/gaussian.hlsl
index f57cd4ddcf58..b01982b10b2a 100644
--- a/assets/scalers/gaussian.hlsl
+++ b/assets/scalers/gaussian.hlsl
@@ -1,3 +1,10 @@
+#ifdef BLEND_ALPHA
+float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); } // RGB scaled by alpha; alpha is clamped to [0,1] first
+float4 postdivide_alpha(float4 c) { return c.a < 0.001f? float4(0.0f,0.0f,0.0f,0.0f) : float4(c.rgb / c.a, c.a); } // divides RGB by alpha; alpha below 0.001 returns all zeros instead of dividing
+#else
+#define premultiply_alpha(c) (c)
+#define postdivide_alpha(c) (c)
+#endif
 
 #define sharpness 1.0
 #define pi 3.14159265358
diff --git a/assets/scalers/sabr.hlsl b/assets/scalers/sabr.hlsl
index 05d0222e694e..a62e0c43a239 100644
--- a/assets/scalers/sabr.hlsl
+++ b/assets/scalers/sabr.hlsl
@@ -1,3 +1,10 @@
+#ifdef BLEND_ALPHA
+float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); } // RGB scaled by alpha; alpha is clamped to [0,1] first
+float4 postdivide_alpha(float4 c) { return c.a < 0.001f? float4(0.0f,0.0f,0.0f,0.0f) : float4(c.rgb / c.a, c.a); } // divides RGB by alpha; alpha below 0.001 returns all zeros instead of dividing
+#else
+#define premultiply_alpha(c) (c)
+#define postdivide_alpha(c) (c)
+#endif
 
 float c_df(float4 c1, float4 c2) {
 	float3 df = abs(c1.rgb - c2.rgb);
diff --git a/assets/scalers/xbr.hlsl b/assets/scalers/xbr.hlsl
index 546831f36c1a..6dc9a85ac68f 100644
--- a/assets/scalers/xbr.hlsl
+++ b/assets/scalers/xbr.hlsl
@@ -25,6 +25,14 @@
    Incorporates some of the ideas from SABR shader. Thanks to Joshua Street.
 */
 
+#ifdef BLEND_ALPHA
+float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); } // RGB scaled by alpha; alpha is clamped to [0,1] first
+float4 postdivide_alpha(float4 c) { return c.a < 0.001f? float4(0.0f,0.0f,0.0f,0.0f) : float4(c.rgb / c.a, c.a); } // divides RGB by alpha; alpha below 0.001 returns all zeros instead of dividing
+#else
+#define premultiply_alpha(c) (c)
+#define postdivide_alpha(c) (c)
+#endif
+
 #define XBR_Y_WEIGHT          48.0 //  0.0 .. 100.0
 #define XBR_EQ_THRESHOLD      15.0 //  0.0 ..  50.0
 #define XBR_LV1_COEFFICIENT    0.5 //  0.0 ..  30.0
@@ -35,7 +43,7 @@
 #define XBR_SCALE 4.0
 
 static const float coef           = 2.0;
-static const float3  rgbw         = float3(14.352, 28.176, 5.472);
+static const float4  rgbw         = float4(14.352, 28.176, 5.472, 50); // .a weights the alpha channel in the dot(pixel, rgbw) sums in tex_sample
 static const float4  eq_threshold = float4(15.0, 15.0, 15.0, 15.0);
 
 static const float4 delta   = float4(1.0/XBR_SCALE, 1.0/XBR_SCALE, 1.0/XBR_SCALE, 1.0/XBR_SCALE);
@@ -53,7 +61,7 @@ static const float4 By = float4( 2.0,  0.5, -2.0,-0.5 );
 static const float4 Cy = float4( 2.0,  0.0, -1.0, 0.5 );
 static const float4 Ci = float4(0.25, 0.25, 0.25, 0.25);
 
-static const float3 Y = float3(0.2126, 0.7152, 0.0722);
+static const float4 Y = float4(0.2126, 0.7152, 0.0722, 1.0); // .a = 1.0 lets alpha enter the SMALL_DETAILS mul(float4x4(...), y_weight * Y) sums
 
 // Difference between vector components.
 float4 df(float4 A, float4 B) { return abs(A-B); }
@@ -76,9 +84,9 @@ float4 weighted_distance(float4 a, float4 b, float4 c, float4 d, float4 e, float
 	return (df(a,b) + df(a,c) + df(d,e) + df(d,f) + df(i,j) + df(k,l) + 2.0*df(g,h));
 }
 
-float c_df(float3 c1, float3 c2) {
-  float3 df = abs(c1 - c2);
-  return df.r + df.g + df.b;
+float c_df(float4 c1, float4 c2) { // sum of absolute per-channel differences between two colours, alpha included
+  float4 df = abs(c1 - c2); // NOTE: this local shadows the df() helper declared earlier in the file
+  return df.r + df.g + df.b + df.a;
 }
 
 float4 tex_sample(float2 coord) {
@@ -98,32 +106,32 @@ float4 tex_sample(float2 coord) {
 
   float2 fp  = frac(coord*u_texSize.xy);
 
-  float4 A1 = premultiply_alpha(tex_sample_direct(t1.xw));
-  float4 B1 = premultiply_alpha(tex_sample_direct(t1.yw));
-  float4 C1 = premultiply_alpha(tex_sample_direct(t1.zw));
-  float4 A  = premultiply_alpha(tex_sample_direct(t2.xw));
-  float4 B  = premultiply_alpha(tex_sample_direct(t2.yw));
-  float4 C  = premultiply_alpha(tex_sample_direct(t2.zw));
-  float4 D  = premultiply_alpha(tex_sample_direct(t3.xw));
-  float4 E  = premultiply_alpha(tex_sample_direct(t3.yw));
-  float4 F  = premultiply_alpha(tex_sample_direct(t3.zw));
-  float4 G  = premultiply_alpha(tex_sample_direct(t4.xw));
-  float4 H  = premultiply_alpha(tex_sample_direct(t4.yw));
-  float4 I  = premultiply_alpha(tex_sample_direct(t4.zw));
-  float4 G5 = premultiply_alpha(tex_sample_direct(t5.xw));
-  float4 H5 = premultiply_alpha(tex_sample_direct(t5.yw));
-  float4 I5 = premultiply_alpha(tex_sample_direct(t5.zw));
-  float4 A0 = premultiply_alpha(tex_sample_direct(t6.xy));
-  float4 D0 = premultiply_alpha(tex_sample_direct(t6.xz));
-  float4 G0 = premultiply_alpha(tex_sample_direct(t6.xw));
-  float4 C4 = premultiply_alpha(tex_sample_direct(t7.xy));
-  float4 F4 = premultiply_alpha(tex_sample_direct(t7.xz));
-  float4 I4 = premultiply_alpha(tex_sample_direct(t7.xw));
-
-  float4 b  = float4(dot(B.rgb ,rgbw), dot(D.rgb ,rgbw), dot(H.rgb ,rgbw), dot(F.rgb ,rgbw));
-  float4 c  = float4(dot(C.rgb ,rgbw), dot(A.rgb ,rgbw), dot(G.rgb ,rgbw), dot(I.rgb ,rgbw));
+  float4 A1 = tex_sample_direct(t1.xw);
+  float4 B1 = tex_sample_direct(t1.yw);
+  float4 C1 = tex_sample_direct(t1.zw);
+  float4 A  = tex_sample_direct(t2.xw);
+  float4 B  = tex_sample_direct(t2.yw);
+  float4 C  = tex_sample_direct(t2.zw);
+  float4 D  = tex_sample_direct(t3.xw);
+  float4 E  = tex_sample_direct(t3.yw);
+  float4 F  = tex_sample_direct(t3.zw);
+  float4 G  = tex_sample_direct(t4.xw);
+  float4 H  = tex_sample_direct(t4.yw);
+  float4 I  = tex_sample_direct(t4.zw);
+  float4 G5 = tex_sample_direct(t5.xw);
+  float4 H5 = tex_sample_direct(t5.yw);
+  float4 I5 = tex_sample_direct(t5.zw);
+  float4 A0 = tex_sample_direct(t6.xy);
+  float4 D0 = tex_sample_direct(t6.xz);
+  float4 G0 = tex_sample_direct(t6.xw);
+  float4 C4 = tex_sample_direct(t7.xy);
+  float4 F4 = tex_sample_direct(t7.xz);
+  float4 I4 = tex_sample_direct(t7.xw);
+
+  float4 b  = float4(dot(B, rgbw), dot(D, rgbw), dot(H, rgbw), dot(F, rgbw));
+  float4 c  = float4(dot(C, rgbw), dot(A, rgbw), dot(G, rgbw), dot(I, rgbw));
   float4 d  = b.yzwx;
-  float4 e  = dot(E.rgb,rgbw);
+  float4 e  = dot(E, rgbw);
   float4 f  = b.wxyz;
   float4 g  = c.zwxy;
   float4 h  = b.zwxy;
@@ -133,13 +141,13 @@ float4 tex_sample(float2 coord) {
 
   float y_weight = XBR_Y_WEIGHT;
 #ifdef SMALL_DETAILS
-  i4 = mul(float4x3(I4.rgb, C1.rgb, A0.rgb, G5.rgb), y_weight * Y);
-  i5 = mul(float4x3(I5.rgb, C4.rgb, A1.rgb, G0.rgb), y_weight * Y);
-  h5 = mul(float4x3(H5.rgb, F4.rgb, B1.rgb, D0.rgb), y_weight * Y);
+  i4 = mul(float4x4(I4, C1, A0, G5), y_weight * Y);
+  i5 = mul(float4x4(I5, C4, A1, G0), y_weight * Y);
+  h5 = mul(float4x4(H5, F4, B1, D0), y_weight * Y);
 #else
-  i4 = float4(dot(I4.rgb,rgbw), dot(C1.rgb,rgbw), dot(A0.rgb,rgbw), dot(G5.rgb,rgbw));
-  i5 = float4(dot(I5.rgb,rgbw), dot(C4.rgb,rgbw), dot(A1.rgb,rgbw), dot(G0.rgb,rgbw));
-  h5 = float4(dot(H5.rgb,rgbw), dot(F4.rgb,rgbw), dot(B1.rgb,rgbw), dot(D0.rgb,rgbw));
+  i4 = float4(dot(I4,rgbw), dot(C1,rgbw), dot(A0,rgbw), dot(G5,rgbw));
+  i5 = float4(dot(I5,rgbw), dot(C4,rgbw), dot(A1,rgbw), dot(G0,rgbw));
+  h5 = float4(dot(H5,rgbw), dot(F4,rgbw), dot(B1,rgbw), dot(D0,rgbw));
 #endif
   f4 = h5.yzwx;
 
@@ -197,14 +205,14 @@ float4 tex_sample(float2 coord) {
   float4 maximos = max(max(fx30, fx60), fx45);
 #endif
 
-  float4 res1 = E;
-  res1 = lerp(res1, lerp(H, F, px.x), maximos.x);
-  res1 = lerp(res1, lerp(B, D, px.z), maximos.z);
+  float4 res1 = premultiply_alpha(E);
+  res1 = lerp(res1, premultiply_alpha(lerp(H, F, px.x)), maximos.x);
+  res1 = lerp(res1, premultiply_alpha(lerp(B, D, px.z)), maximos.z);
 
-  float4 res2 = E;
-  res2 = lerp(res2, lerp(F, B, px.y), maximos.y);
-  res2 = lerp(res2, lerp(D, H, px.w), maximos.w);
+  float4 res2 = premultiply_alpha(E);
+  res2 = lerp(res2, premultiply_alpha(lerp(F, B, px.y)), maximos.y);
+  res2 = lerp(res2, premultiply_alpha(lerp(D, H, px.w)), maximos.w);
 
-  float4 res = lerp(res1, res2, step(c_df(E.rgb, res1.rgb), c_df(E.rgb, res2.rgb)));
+  float4 res = lerp(res1, res2, step(c_df(E, res1), c_df(E, res2)));
   return postdivide_alpha(res);
 }
diff --git a/assets/scalers/xbrz.hlsl b/assets/scalers/xbrz.hlsl
index 43745eb453ca..4031722b8587 100644
--- a/assets/scalers/xbrz.hlsl
+++ b/assets/scalers/xbrz.hlsl
@@ -1,3 +1,6 @@
+// xBRZ-freescale
+// based on the 4xBRZ shader credited below:
+
 
 // 4xBRZ shader - Copyright (C) 2014-2016 DeSmuME team
 //
@@ -46,23 +49,38 @@
 #define EQUAL_COLOR_TOLERANCE 30.0/255.0
 #define STEEP_DIRECTION_THRESHOLD 2.2
 #define DOMINANT_DIRECTION_THRESHOLD 3.6
-
-float reduce(float4 color)
-{
-  return dot(color.rgb, float3(65536.0, 256.0, 1.0));
-}
+#define BLEND_ALPHA
+
+#ifdef BLEND_ALPHA
+// TODO: check why the [0.0,1.0] clamp is necessary here
+float4 premultiply_alpha(float4 c) { float a = clamp(c.a, 0.0, 1.0); return float4(c.rgb * a, a); } // RGB scaled by alpha; alpha is clamped to [0,1] first
+float4 postdivide_alpha(float4 c) { return c.a < 0.001f? float4(0.0f,0.0f,0.0f,0.0f) : float4(c.rgb / c.a, c.a); } // divides RGB by alpha; alpha below 0.001 returns all zeros instead of dividing
+#define eq(a,b)  all(a == b)
+#define neq(a,b) any(a != b)
+#else
+#define premultiply_alpha(c) (c)
+#define postdivide_alpha(c) (c)
+#define eq(a,b)  all(a.rgb == b.rgb)
+#define neq(a,b) any(a.rgb != b.rgb)
+#endif
+
+#define P(x,y) tex_sample_direct(coord + u_texSize.zw * float2(x, y))
 
 float DistYCbCr(float4 pixA, float4 pixB)
 {
   const float3 w = float3(0.2627, 0.6780, 0.0593);
   const float scaleB = 0.5 / (1.0 - w.b);
   const float scaleR = 0.5 / (1.0 - w.r);
-  float3 diff = pixA.rgb - pixB.rgb;
-  float Y = dot(diff, w);
+  float4 diff = pixA - pixB; // full RGBA difference; diff.a is only read in the BLEND_ALPHA branch below
+  float Y = dot(diff.rgb, w); // weighted RGB difference (the Y term); alpha is excluded here
   float Cb = scaleB * (diff.b - Y);
   float Cr = scaleR * (diff.r - Y);
 
-  return sqrt( ((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr) );
+#ifdef BLEND_ALPHA
+  return sqrt(((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr) + (diff.a * diff.a)); // alpha difference added as a fourth squared term
+#else
+  return sqrt(((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr)); // colour-only distance
+#endif
 }
 
 bool IsPixEqual(const float4 pixA, const float4 pixB)
@@ -70,266 +88,213 @@ bool IsPixEqual(const float4 pixA, const float4 pixB)
   return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE);
 }
 
-bool IsBlendingNeeded(const int4 blend)
+float get_left_ratio(float2 center, float2 origin, float2 direction, float2 scale) // weight in [0,1] from the signed, scaled distance of center to the line through origin along direction
 {
-  return any(blend - BLEND_NONE);
+  float2 P0 = center - origin; // center relative to the line's origin
+  float2 proj = direction * (dot(P0, direction) / dot(direction, direction)); // projection of P0 onto direction
+  float2 distv = P0 - proj; // component of P0 perpendicular to the line
+  float2 orth = float2(-direction.y, direction.x); // direction rotated by 90 degrees
+  float side = sign(dot(P0, orth)); // -1, 0 or +1 depending on which side of the line center lies
+  float v = side * length(distv * scale); // signed distance to the line after per-axis scaling
+
+//  return step(0, v);  (hard-edged alternative to the smoothstep below)
+  return smoothstep(-sqrt(2.0)/2.0, sqrt(2.0)/2.0, v); // smooth 0..1 ramp as v crosses [-sqrt(2)/2, sqrt(2)/2]
 }
 
-//---------------------------------------
-// Input Pixel Mapping:    --|21|22|23|--
-//                         19|06|07|08|09
-//                         18|05|00|01|10
-//                         17|04|03|02|11
-//                         --|15|14|13|--
-//
-// Output Pixel Mapping: 20|21|22|23|24|25
-//                       19|06|07|08|09|26
-//                       18|05|00|01|10|27
-//                       17|04|03|02|11|28
-//                       16|15|14|13|12|29
-//                       35|34|33|32|31|30
-
-float4 tex_sample(float2 coord)
-{
-  float dx = u_texSize.z;
-  float dy = u_texSize.w;
-
-    //  A1 B1 C1
-  // A0 A  B  C C4
-  // D0 D  E  F F4
-  // G0 G  H  I I4
-    //  G5 H5 I5
+float4 tex_sample(float2 coord) {
+  //---------------------------------------
+  // Input Pixel Mapping:  -|x|x|x|-
+  //                       x|A|B|C|x
+  //                       x|D|E|F|x
+  //                       x|G|H|I|x
+  //                       -|x|x|x|-
+
+  float2 scale = u_texSize.zw / float2(ddx(coord.x), ddy(coord.y));
+  float2 pos = frac(coord * u_texSize.xy) - float2(0.5, 0.5);
+  float4 A = P(-1,-1);
+  float4 B = P( 0,-1);
+  float4 C = P( 1,-1);
+  float4 D = P(-1, 0);
+  float4 E = P( 0, 0);
+  float4 F = P( 1, 0);
+  float4 G = P(-1, 1);
+  float4 H = P( 0, 1);
+  float4 I = P( 1, 1);
+
+  // blendResult Mapping: x|y|
+  //                      w|z|
+  int4 blendResult = int4(BLEND_NONE,BLEND_NONE,BLEND_NONE,BLEND_NONE);
 
-  float4 t1 = coord.xxxy + float4( -dx, 0.0, dx,-2.0*dy); // A1 B1 C1
-  float4 t2 = coord.xxxy + float4( -dx, 0.0, dx, -dy); // A B C
-  float4 t3 = coord.xxxy + float4( -dx, 0.0, dx, 0.0); // D E F
-  float4 t4 = coord.xxxy + float4( -dx, 0.0, dx, dy); // G H I
-  float4 t5 = coord.xxxy + float4( -dx, 0.0, dx, 2.0*dy); // G5 H5 I5
-  float4 t6 = coord.xyyy + float4(-2.0*dx,-dy, 0.0, dy); // A0 D0 G0
-  float4 t7 = coord.xyyy + float4( 2.0*dx,-dy, 0.0, dy); // C4 F4 I4
+  // Preprocess corners
+  // Pixel Tap Mapping: -|-|-|-|-
+  //                    -|-|B|C|-
+  //                    -|D|E|F|x
+  //                    -|G|H|I|x
+  //                    -|-|x|x|-
+  if (!((eq(E,F) && eq(H,I)) || (eq(E,H) && eq(F,I))))
+  {
+    float dist_H_F = DistYCbCr(G, E) + DistYCbCr(E, C) + DistYCbCr(P(0,2), I) + DistYCbCr(I, P(2,0)) + (4.0 * DistYCbCr(H, F));
+    float dist_E_I = DistYCbCr(D, H) + DistYCbCr(H, P(1,2)) + DistYCbCr(B, F) + DistYCbCr(F, P(2,1)) + (4.0 * DistYCbCr(E, I));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_H_F) < dist_E_I;
+    blendResult.z = ((dist_H_F < dist_E_I) && neq(E,F) && neq(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
 
-  float2 f = frac(coord.xy * u_texSize.xy);
 
-  //---------------------------------------
-  // Input Pixel Mapping:    |21|22|23|
-  //                       19|06|07|08|09
-  //                       18|05|00|01|10
-  //                       17|04|03|02|11
-  //                         |15|14|13|
-
-  float4 src[25];
-
-  src[21] = premultiply_alpha(tex_sample_direct(t1.xw));
-  src[22] = premultiply_alpha(tex_sample_direct(t1.yw));
-  src[23] = premultiply_alpha(tex_sample_direct(t1.zw));
-  src[ 6] = premultiply_alpha(tex_sample_direct(t2.xw));
-  src[ 7] = premultiply_alpha(tex_sample_direct(t2.yw));
-  src[ 8] = premultiply_alpha(tex_sample_direct(t2.zw));
-  src[ 5] = premultiply_alpha(tex_sample_direct(t3.xw));
-  src[ 0] = premultiply_alpha(tex_sample_direct(t3.yw));
-  src[ 1] = premultiply_alpha(tex_sample_direct(t3.zw));
-  src[ 4] = premultiply_alpha(tex_sample_direct(t4.xw));
-  src[ 3] = premultiply_alpha(tex_sample_direct(t4.yw));
-  src[ 2] = premultiply_alpha(tex_sample_direct(t4.zw));
-  src[15] = premultiply_alpha(tex_sample_direct(t5.xw));
-  src[14] = premultiply_alpha(tex_sample_direct(t5.yw));
-  src[13] = premultiply_alpha(tex_sample_direct(t5.zw));
-  src[19] = premultiply_alpha(tex_sample_direct(t6.xy));
-  src[18] = premultiply_alpha(tex_sample_direct(t6.xz));
-  src[17] = premultiply_alpha(tex_sample_direct(t6.xw));
-  src[ 9] = premultiply_alpha(tex_sample_direct(t7.xy));
-  src[10] = premultiply_alpha(tex_sample_direct(t7.xz));
-  src[11] = premultiply_alpha(tex_sample_direct(t7.xw));
-
-  float v[9];
-  v[0] = reduce(src[0]);
-  v[1] = reduce(src[1]);
-  v[2] = reduce(src[2]);
-  v[3] = reduce(src[3]);
-  v[4] = reduce(src[4]);
-  v[5] = reduce(src[5]);
-  v[6] = reduce(src[6]);
-  v[7] = reduce(src[7]);
-  v[8] = reduce(src[8]);
-
-  int4 blendResult = BLEND_NONE;
+  // Pixel Tap Mapping: -|-|-|-|-
+  //                    -|A|B|-|-
+  //                    x|D|E|F|-
+  //                    x|G|H|I|-
+  //                    -|x|x|-|-
+  if (!((eq(D,E) && eq(G,H)) || (eq(D,G) && eq(E,H))))
+  {
+    float dist_G_E = DistYCbCr(P(-2,1)  , D) + DistYCbCr(D, B) + DistYCbCr(P(-1,2), H) + DistYCbCr(H, F) + (4.0 * DistYCbCr(G, E));
+    float dist_D_H = DistYCbCr(P(-2,0)  , G) + DistYCbCr(G, P(0,2)) + DistYCbCr(A, E) + DistYCbCr(E, I) + (4.0 * DistYCbCr(D, H));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_H) < dist_G_E;
+    blendResult.w = ((dist_G_E > dist_D_H) && neq(E,D) && neq(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+  }
 
-  // Preprocess corners
-  // Pixel Tap Mapping: --|--|--|--|--
-  //                    --|--|07|08|--
-  //                    --|05|00|01|10
-  //                    --|04|03|02|11
-  //                    --|--|14|13|--
-  // Corner (1, 1)
-  if ( ((v[0] == v[1] && v[3] == v[2]) || (v[0] == v[3] && v[1] == v[2])) == false)
+  // Pixel Tap Mapping: -|-|x|x|-
+  //                    -|A|B|C|x
+  //                    -|D|E|F|x
+  //                    -|-|H|I|-
+  //                    -|-|-|-|-
+  if (!((eq(B,C) && eq(E,F)) || (eq(B,E) && eq(C,F))))
   {
-    float dist_03_01 = DistYCbCr(src[ 4], src[ 0]) + DistYCbCr(src[ 0], src[ 8]) + DistYCbCr(src[14], src[ 2]) + DistYCbCr(src[ 2], src[10]) + (4.0 * DistYCbCr(src[ 3], src[ 1]));
-    float dist_00_02 = DistYCbCr(src[ 5], src[ 3]) + DistYCbCr(src[ 3], src[13]) + DistYCbCr(src[ 7], src[ 1]) + DistYCbCr(src[ 1], src[11]) + (4.0 * DistYCbCr(src[ 0], src[ 2]));
-    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_03_01) < dist_00_02;
-    blendResult[2] = ((dist_03_01 < dist_00_02) && (v[0] != v[1]) && (v[0] != v[3])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+    float dist_E_C = DistYCbCr(D, B) + DistYCbCr(B, P(1,-2)) + DistYCbCr(H, F) + DistYCbCr(F, P(2,-1)) + (4.0 * DistYCbCr(E, C));
+    float dist_B_F = DistYCbCr(A, E) + DistYCbCr(E, I) + DistYCbCr(P(0,-2), C) + DistYCbCr(C, P(2,0)) + (4.0 * DistYCbCr(B, F));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_B_F) < dist_E_C;
+    blendResult.y = ((dist_E_C > dist_B_F) && neq(E,B) && neq(E,F)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
   }
 
-  // Pixel Tap Mapping: --|--|--|--|--
-  //                    --|06|07|--|--
-  //                    18|05|00|01|--
-  //                    17|04|03|02|--
-  //                    --|15|14|--|--
-  // Corner (0, 1)
-  if ( ((v[5] == v[0] && v[4] == v[3]) || (v[5] == v[4] && v[0] == v[3])) == false)
+  // Pixel Tap Mapping: -|x|x|-|-
+  //                    x|A|B|C|-
+  //                    x|D|E|F|-
+  //                    -|G|H|-|-
+  //                    -|-|-|-|-
+  if (!((eq(A,B) && eq(D,E)) || (eq(A,D) && eq(B,E))))
   {
-    float dist_04_00 = DistYCbCr(src[17], src[ 5]) + DistYCbCr(src[ 5], src[ 7]) + DistYCbCr(src[15], src[ 3]) + DistYCbCr(src[ 3], src[ 1]) + (4.0 * DistYCbCr(src[ 4], src[ 0]));
-    float dist_05_03 = DistYCbCr(src[18], src[ 4]) + DistYCbCr(src[ 4], src[14]) + DistYCbCr(src[ 6], src[ 0]) + DistYCbCr(src[ 0], src[ 2]) + (4.0 * DistYCbCr(src[ 5], src[ 3]));
-    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_05_03) < dist_04_00;
-    blendResult[3] = ((dist_04_00 > dist_05_03) && (v[0] != v[5]) && (v[0] != v[3])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+    float dist_D_B = DistYCbCr(P(-2,0), A) + DistYCbCr(A, P(0,-2)) + DistYCbCr(G, E) + DistYCbCr(E, C) + (4.0 * DistYCbCr(D, B));
+    float dist_A_E = DistYCbCr(P(-2,-1), D) + DistYCbCr(D, H) + DistYCbCr(P(-1,-2), B) + DistYCbCr(B, F) + (4.0 * DistYCbCr(A, E));
+    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_B) < dist_A_E;
+    blendResult.x = ((dist_D_B < dist_A_E) && neq(E,D) && neq(E,B)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
   }
 
-  // Pixel Tap Mapping: --|--|22|23|--
-  //                    --|06|07|08|09
-  //                    --|05|00|01|10
-  //                    --|--|03|02|--
-  //                    --|--|--|--|--
-  // Corner (1, 0)
-  if ( ((v[7] == v[8] && v[0] == v[1]) || (v[7] == v[0] && v[8] == v[1])) == false)
+  float4 res = premultiply_alpha(E);
+
+  // Pixel Tap Mapping: -|-|-|-|-
+  //                    -|-|B|C|-
+  //                    -|D|E|F|x
+  //                    -|G|H|I|x
+  //                    -|-|x|x|-
+  if(blendResult.z != BLEND_NONE)
   {
-    float dist_00_08 = DistYCbCr(src[ 5], src[ 7]) + DistYCbCr(src[ 7], src[23]) + DistYCbCr(src[ 3], src[ 1]) + DistYCbCr(src[ 1], src[ 9]) + (4.0 * DistYCbCr(src[ 0], src[ 8]));
-    float dist_07_01 = DistYCbCr(src[ 6], src[ 0]) + DistYCbCr(src[ 0], src[ 2]) + DistYCbCr(src[22], src[ 8]) + DistYCbCr(src[ 8], src[10]) + (4.0 * DistYCbCr(src[ 7], src[ 1]));
-    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_07_01) < dist_00_08;
-    blendResult[1] = ((dist_00_08 > dist_07_01) && (v[0] != v[7]) && (v[0] != v[1])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+    float dist_F_G = DistYCbCr(F, G);
+    float dist_H_C = DistYCbCr(H, C);
+    bool doLineBlend = (blendResult.z == BLEND_DOMINANT ||
+                !((blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) || (blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) ||
+                  (IsPixEqual(G, H) && IsPixEqual(H, I) && IsPixEqual(I, F) && IsPixEqual(F, C) && !IsPixEqual(E, I))));
+
+    float2 origin = float2(0.0, 1.0 / sqrt(2.0));
+    float2 direction = float2(1.0, -1.0);
+    if(doLineBlend)
+    {
+      bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_F_G <= dist_H_C) && neq(E,G) && neq(D,G);
+      bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_H_C <= dist_F_G) && neq(E,C) && neq(B,C);
+      origin = haveShallowLine? float2(0.0, 0.25) : float2(0.0, 0.5);
+      direction.x += haveShallowLine? 1.0: 0.0;
+      direction.y -= haveSteepLine? 1.0: 0.0;
+    }
+
+    float4 blendPix = premultiply_alpha(lerp(H,F, step(DistYCbCr(E, F), DistYCbCr(E, H))));
+    res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
   }
 
-  // Pixel Tap Mapping: --|21|22|--|--
-  //                    19|06|07|08|--
-  //                    18|05|00|01|--
-  //                    --|04|03|--|--
-  //                    --|--|--|--|--
-  // Corner (0, 0)
-  if ( ((v[6] == v[7] && v[5] == v[0]) || (v[6] == v[5] && v[7] == v[0])) == false)
+  // Pixel Tap Mapping: -|-|-|-|-
+  //                    -|A|B|-|-
+  //                    x|D|E|F|-
+  //                    x|G|H|I|-
+  //                    -|x|x|-|-
+  if(blendResult.w != BLEND_NONE)
   {
-    float dist_05_07 = DistYCbCr(src[18], src[ 6]) + DistYCbCr(src[ 6], src[22]) + DistYCbCr(src[ 4], src[ 0]) + DistYCbCr(src[ 0], src[ 8]) + (4.0 * DistYCbCr(src[ 5], src[ 7]));
-    float dist_06_00 = DistYCbCr(src[19], src[ 5]) + DistYCbCr(src[ 5], src[ 3]) + DistYCbCr(src[21], src[ 7]) + DistYCbCr(src[ 7], src[ 1]) + (4.0 * DistYCbCr(src[ 6], src[ 0]));
-    bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_05_07) < dist_06_00;
-    blendResult[0] = ((dist_05_07 < dist_06_00) && (v[0] != v[5]) && (v[0] != v[7])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
+    float dist_H_A = DistYCbCr(H, A);
+    float dist_D_I = DistYCbCr(D, I);
+    bool doLineBlend = (blendResult.w == BLEND_DOMINANT ||
+                !((blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) || (blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) ||
+                  (IsPixEqual(A, D) && IsPixEqual(D, G) && IsPixEqual(G, H) && IsPixEqual(H, I) && !IsPixEqual(E, G))));
+
+    float2 origin = float2(-1.0 / sqrt(2.0), 0.0);
+    float2 direction = float2(1.0, 1.0);
+    if(doLineBlend)
+    {
+      bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_H_A <= dist_D_I) && neq(E,A) && neq(B,A);
+      bool haveSteepLine  = (STEEP_DIRECTION_THRESHOLD * dist_D_I <= dist_H_A) && neq(E,I) && neq(F,I);
+      origin = haveShallowLine? float2(-0.25, 0.0) : float2(-0.5, 0.0);
+      direction.y += haveShallowLine? 1.0: 0.0;
+      direction.x += haveSteepLine? 1.0: 0.0;
+    }
+    origin = origin;
+    direction = direction;
+
+    float4 blendPix = premultiply_alpha(lerp(H,D, step(DistYCbCr(E, D), DistYCbCr(E, H))));
+    res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
   }
 
-  float4 dst[16];
-  dst[ 0] = src[0];
-  dst[ 1] = src[0];
-  dst[ 2] = src[0];
-  dst[ 3] = src[0];
-  dst[ 4] = src[0];
-  dst[ 5] = src[0];
-  dst[ 6] = src[0];
-  dst[ 7] = src[0];
-  dst[ 8] = src[0];
-  dst[ 9] = src[0];
-  dst[10] = src[0];
-  dst[11] = src[0];
-  dst[12] = src[0];
-  dst[13] = src[0];
-  dst[14] = src[0];
-  dst[15] = src[0];
-
-  // Scale pixel
-  if (IsBlendingNeeded(blendResult) == true)
+  // Pixel Tap Mapping: -|-|x|x|-
+  //                    -|A|B|C|x
+  //                    -|D|E|F|x
+  //                    -|-|H|I|-
+  //                    -|-|-|-|-
+  if(blendResult.y != BLEND_NONE)
   {
-    float dist_01_04 = DistYCbCr(src[1], src[4]);
-    float dist_03_08 = DistYCbCr(src[3], src[8]);
-    bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[4]) && (v[5] != v[4]);
-    bool haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[8]) && (v[7] != v[8]);
-    bool needBlend = (blendResult[2] != BLEND_NONE);
-    bool doLineBlend = (  blendResult[2] >= BLEND_DOMINANT ||
-               ((blendResult[1] != BLEND_NONE && !IsPixEqual(src[0], src[4])) ||
-               (blendResult[3] != BLEND_NONE && !IsPixEqual(src[0], src[8])) ||
-               (IsPixEqual(src[4], src[3]) && IsPixEqual(src[3], src[2]) && IsPixEqual(src[2], src[1]) && IsPixEqual(src[1], src[8]) && IsPixEqual(src[0], src[2]) == false) ) == false );
-
-    float4 blendPix = ( DistYCbCr(src[0], src[1]) <= DistYCbCr(src[0], src[3]) ) ? src[1] : src[3];
-    dst[ 2] = lerp(dst[ 2], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
-    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
-    dst[10] = lerp(dst[10], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
-    dst[11] = lerp(dst[11], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
-    dst[12] = lerp(dst[12], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
-    dst[13] = lerp(dst[13], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
-    dst[14] = lerp(dst[14], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
-    dst[15] = lerp(dst[15], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
-
-    dist_01_04 = DistYCbCr(src[7], src[2]);
-    dist_03_08 = DistYCbCr(src[1], src[6]);
-    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[2]) && (v[3] != v[2]);
-    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[6]) && (v[5] != v[6]);
-    needBlend = (blendResult[1] != BLEND_NONE);
-    doLineBlend = (  blendResult[1] >= BLEND_DOMINANT ||
-            !((blendResult[0] != BLEND_NONE && !IsPixEqual(src[0], src[2])) ||
-            (blendResult[2] != BLEND_NONE && !IsPixEqual(src[0], src[6])) ||
-            (IsPixEqual(src[2], src[1]) && IsPixEqual(src[1], src[8]) && IsPixEqual(src[8], src[7]) && IsPixEqual(src[7], src[6]) && !IsPixEqual(src[0], src[8])) ) );
-
-    blendPix = ( DistYCbCr(src[0], src[7]) <= DistYCbCr(src[0], src[1]) ) ? src[7] : src[1];
-    dst[ 1] = lerp(dst[ 1], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
-    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
-    dst[ 7] = lerp(dst[ 7], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
-    dst[ 8] = lerp(dst[ 8], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
-    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
-    dst[10] = lerp(dst[10], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
-    dst[11] = lerp(dst[11], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
-    dst[12] = lerp(dst[12], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
-
-    dist_01_04 = DistYCbCr(src[5], src[8]);
-    dist_03_08 = DistYCbCr(src[7], src[4]);
-    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[8]) && (v[1] != v[8]);
-    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[4]) && (v[3] != v[4]);
-    needBlend = (blendResult[0] != BLEND_NONE);
-    doLineBlend = (  blendResult[0] >= BLEND_DOMINANT ||
-            !((blendResult[3] != BLEND_NONE && !IsPixEqual(src[0], src[8])) ||
-            (blendResult[1] != BLEND_NONE && !IsPixEqual(src[0], src[4])) ||
-            (IsPixEqual(src[8], src[7]) && IsPixEqual(src[7], src[6]) && IsPixEqual(src[6], src[5]) && IsPixEqual(src[5], src[4]) && !IsPixEqual(src[0], src[6])) ) );
-
-    blendPix = ( DistYCbCr(src[0], src[5]) <= DistYCbCr(src[0], src[7]) ) ? src[5] : src[7];
-    dst[ 0] = lerp(dst[ 0], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
-    dst[15] = lerp(dst[15], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
-    dst[ 4] = lerp(dst[ 4], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
-    dst[ 5] = lerp(dst[ 5], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
-    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
-    dst[ 7] = lerp(dst[ 7], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
-    dst[ 8] = lerp(dst[ 8], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
-    dst[ 9] = lerp(dst[ 9], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
-
-
-    dist_01_04 = DistYCbCr(src[3], src[6]);
-    dist_03_08 = DistYCbCr(src[5], src[2]);
-    haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[6]) && (v[7] != v[6]);
-    haveSteepLine   = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[2]) && (v[1] != v[2]);
-    needBlend = (blendResult[3] != BLEND_NONE);
-    doLineBlend = (  blendResult[3] >= BLEND_DOMINANT ||
-            !((blendResult[2] != BLEND_NONE && !IsPixEqual(src[0], src[6])) ||
-            (blendResult[0] != BLEND_NONE && !IsPixEqual(src[0], src[2])) ||
-            (IsPixEqual(src[6], src[5]) && IsPixEqual(src[5], src[4]) && IsPixEqual(src[4], src[3]) && IsPixEqual(src[3], src[2]) && !IsPixEqual(src[0], src[4])) ) );
-
-    blendPix = ( DistYCbCr(src[0], src[3]) <= DistYCbCr(src[0], src[5]) ) ? src[3] : src[5];
-    dst[ 3] = lerp(dst[ 3], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
-    dst[12] = lerp(dst[12], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
-    dst[13] = lerp(dst[13], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
-    dst[14] = lerp(dst[14], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
-    dst[15] = lerp(dst[15], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
-    dst[ 4] = lerp(dst[ 4], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
-    dst[ 5] = lerp(dst[ 5], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
-    dst[ 6] = lerp(dst[ 6], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
+    float dist_B_I = DistYCbCr(B, I); // compared with dist_F_A below to classify the line as shallow or steep
+    float dist_F_A = DistYCbCr(F, A);
+    bool doLineBlend = (blendResult.y == BLEND_DOMINANT || // BLEND_DOMINANT always line-blends; otherwise none of the three exclusions below may hold
+                !((blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) || (blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) ||
+                  (IsPixEqual(I, F) && IsPixEqual(F, C) && IsPixEqual(C, B) && IsPixEqual(B, A) && !IsPixEqual(E, C))));
+    // Defaults for origin/direction; they are used unchanged when doLineBlend is false.
+    float2 origin = float2(1.0 / sqrt(2.0), 0.0);
+    float2 direction = float2(-1.0, -1.0);
+
+    if(doLineBlend)
+    {
+      bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_B_I <= dist_F_A) && neq(E,I) && neq(H,I);
+      bool haveSteepLine  = (STEEP_DIRECTION_THRESHOLD * dist_F_A <= dist_B_I) && neq(E,A) && neq(D,A);
+      origin = haveShallowLine? float2(0.25, 0.0) : float2(0.5, 0.0);
+      direction.y -= haveShallowLine? 1.0: 0.0; // shallow line: direction.y becomes -2.0
+      direction.x -= haveSteepLine? 1.0: 0.0; // steep line: direction.x becomes -2.0
+    }
+    // Blend colour: B when DistYCbCr(E, B) <= DistYCbCr(E, F), otherwise F (step() yields 1 or 0).
+    float4 blendPix = premultiply_alpha(lerp(F,B, step(DistYCbCr(E, B), DistYCbCr(E, F))));
+    res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale)); // mix weight is computed by get_left_ratio()
   }
 
-  // select output pixel
-  float4 res = lerp(lerp(lerp(lerp(dst[ 6], dst[ 7], step(0.25, f.x)),
-                              lerp(dst[ 8], dst[ 9], step(0.75, f.x)),
-                              step(0.50, f.x)),
-                         lerp(lerp(dst[ 5], dst[ 0], step(0.25, f.x)),
-                              lerp(dst[ 1], dst[10], step(0.75, f.x)),
-                              step(0.50, f.x)),
-                         step(0.25, f.y)),
-                    lerp(lerp(lerp(dst[ 4], dst[ 3], step(0.25, f.x)),
-                              lerp(dst[ 2], dst[11], step(0.75, f.x)),
-                              step(0.50, f.x)),
-                         lerp(lerp(dst[15], dst[14], step(0.25, f.x)),
-                              lerp(dst[13], dst[12], step(0.75, f.x)),
-                              step(0.50, f.x)),
-                         step(0.75, f.y)),
-                    step(0.50, f.y));
+  // Pixel Tap Mapping: -|x|x|-|-
+  //                    x|A|B|C|-
+  //                    x|D|E|F|-
+  //                    -|G|H|-|-
+  //                    -|-|-|-|-
+  if(blendResult.x != BLEND_NONE) // this corner is processed only when its blendResult component requests blending
+  {
+    float dist_D_C = DistYCbCr(D, C); // compared with dist_B_G below to classify the line as shallow or steep
+    float dist_B_G = DistYCbCr(B, G);
+    bool doLineBlend = (blendResult.x == BLEND_DOMINANT || // BLEND_DOMINANT always line-blends; otherwise none of the three exclusions below may hold
+                !((blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) || (blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) ||
+                  (IsPixEqual(C, B) && IsPixEqual(B, A) && IsPixEqual(A, D) && IsPixEqual(D, G) && !IsPixEqual(E, A))));
+    // Defaults for origin/direction; they are used unchanged when doLineBlend is false.
+    float2 origin = float2(0.0, -1.0 / sqrt(2.0));
+    float2 direction = float2(-1.0, 1.0);
+    if(doLineBlend)
+    {
+      bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_D_C <= dist_B_G) && neq(E,C) && neq(F,C);
+      bool haveSteepLine  = (STEEP_DIRECTION_THRESHOLD * dist_B_G <= dist_D_C) && neq(E,G) && neq(H,G);
+      origin = haveShallowLine? float2(0.0, -0.25) : float2(0.0, -0.5);
+      direction.x -= haveShallowLine? 1.0: 0.0; // shallow line: direction.x becomes -2.0
+      direction.y += haveSteepLine? 1.0: 0.0; // steep line: direction.y becomes 2.0
+    }
+    // Blend colour: B when DistYCbCr(E, B) <= DistYCbCr(E, D), otherwise D (step() yields 1 or 0).
+    float4 blendPix = premultiply_alpha(lerp(D,B, step(DistYCbCr(E, B), DistYCbCr(E, D))));
+    res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale)); // mix weight is computed by get_left_ratio()
+  }
 
   return postdivide_alpha(res);
-};
+}
diff --git a/libretro/libretro.cpp b/libretro/libretro.cpp
index 5332f805da64..a76e8a4c4461 100644
--- a/libretro/libretro.cpp
+++ b/libretro/libretro.cpp
@@ -175,7 +175,7 @@ static RetroOption<int> ppsspp_button_preference("ppsspp_button_preference", "Co
 static RetroOption<bool> ppsspp_fast_memory("ppsspp_fast_memory", "Fast Memory (Speedhack)", true);
 static RetroOption<bool> ppsspp_block_transfer_gpu("ppsspp_block_transfer_gpu", "Block Transfer GPU", true);
 static RetroOption<int> ppsspp_texture_scaling_level("ppsspp_texture_scaling_level", "Texture Scaling Level", { { "1", 1 }, { "2", 2 }, { "3", 3 }, { "4", 4 }, { "5", 5 }, { "0", 0 } });
-static RetroOption<int> ppsspp_texture_scaling_type("ppsspp_texture_scaling_type", "Texture Scaling Type", { { "xbrz", TextureScalerCommon::XBRZ }, { "hybrid", TextureScalerCommon::HYBRID }, { "bicubic", TextureScalerCommon::BICUBIC }, { "hybrid_bicubic", TextureScalerCommon::HYBRID_BICUBIC }, { "XBR", TextureScalerCommon::XBR }, { "SABR", TextureScalerCommon::SABR }, { "gaussian", TextureScalerCommon::GAUSSIAN }, { "cosine", TextureScalerCommon::COSINE } });
+static RetroOption<int> ppsspp_texture_scaling_type("ppsspp_texture_scaling_type", "Texture Scaling Type", { { "xbrz", TextureScalerCommon::XBRZ }, { "hybrid", TextureScalerCommon::HYBRID }, { "bicubic", TextureScalerCommon::BICUBIC }, { "hybrid_bicubic", TextureScalerCommon::HYBRID_BICUBIC }, { "4xBRZ", TextureScalerCommon::_4XBRZ }, { "XBR", TextureScalerCommon::XBR }, { "SABR", TextureScalerCommon::SABR }, { "gaussian", TextureScalerCommon::GAUSSIAN }, { "cosine", TextureScalerCommon::COSINE } });
 static RetroOption<bool> ppsspp_texture_scaling_realtime("ppsspp_texture_scaling_realtime", "Realtime Texture Scaling", false);
 static RetroOption<int> ppsspp_texture_anisotropic_filtering("ppsspp_texture_anisotropic_filtering", "Anisotropic Filtering", { "off", "1x", "2x", "4x", "8x", "16x" });
 static RetroOption<bool> ppsspp_texture_deposterize("ppsspp_texture_deposterize", "Texture Deposterize", false);