[Perf] 3D implicit FEM performance regression upgrading from 1.4.1 to 1.5.0 #7779

LMeteorYu0330 · 2023-04-10T10:44:22Z

I've observed a significant performance regression from 1.4.1 to 1.5.0 on an implicit 3D FEM.

My own code base is somewhat too large to paste here. I have witnessed the same regression on the following FEM code snippet from the Taichi official repo. It seems that the matmul_cell function seems to be more affected by the upgrade.

import argparse
import numpy as np
import taichi as ti


ti.init(arch=ti.gpu)
E, nu = 5e4, 0.0
mu, la = E / (2 * (1 + nu)), E * nu / ((1 + nu) * (1 - 2 * nu))  # lambda = 0
density = 1000.0
dt = 1e-2

n_cube = np.array([5] * 3)
n_verts = np.product(n_cube)
n_cells = 5 * np.product(n_cube - 1)
dx = 1 / (n_cube.max() - 1)

vertices = ti.Vector.field(4, dtype=ti.i32, shape=n_cells)

x = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
ox = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
v = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
f = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
mul_ans = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
m = ti.field(dtype=ti.f32, shape=n_verts)

n_cells = (n_cube - 1).prod() * 5
B = ti.Matrix.field(3, 3, dtype=ti.f32, shape=n_cells)
W = ti.field(dtype=ti.f32, shape=n_cells)


@ti.func
def i2p(I):
    return (I.x * n_cube[1] + I.y) * n_cube[2] + I.z


@ti.func
def set_element(e, I, verts):
    for i in ti.static(range(3 + 1)):
        vertices[e][i] = i2p(I + (([verts[i] >> k for k in range(3)] ^ I) & 1))


@ti.kernel
def get_vertices():
    '''
    This kernel partitions the cube into tetrahedrons.
    Each unit cube is divided into 5 tetrahedrons.
    '''
    for I in ti.grouped(ti.ndrange(*(n_cube - 1))):
        e = ((I.x * (n_cube[1] - 1) + I.y) * (n_cube[2] - 1) + I.z) * 5
        for i, j in ti.static(enumerate([0, 3, 5, 6])):
            set_element(e + i, I, (j, j ^ 1, j ^ 2, j ^ 4))
        set_element(e + 4, I, (1, 2, 4, 7))
    for I in ti.grouped(ti.ndrange(*(n_cube))):
        ox[i2p(I)] = I * dx


@ti.func
def Ds(verts):
    return ti.Matrix.cols([x[verts[i]] - x[verts[3]] for i in range(3)])


@ti.func
def ssvd(F):
    U, sig, V = ti.svd(F)
    if U.determinant() < 0:
        for i in ti.static(range(3)):
            U[i, 2] *= -1
        sig[2, 2] = -sig[2, 2]
    if V.determinant() < 0:
        for i in ti.static(range(3)):
            V[i, 2] *= -1
        sig[2, 2] = -sig[2, 2]
    return U, sig, V


@ti.func
def get_force_func(c, verts):
    F = Ds(verts) @ B[c]
    P = ti.Matrix.zero(ti.f32, 3, 3)
    U, sig, V = ssvd(F)
    P = 2 * mu * (F - U @ V.transpose())
    H = -W[c] * P @ B[c].transpose()
    for i in ti.static(range(3)):
        force = ti.Vector([H[j, i] for j in range(3)])
        f[verts[i]] += force
        f[verts[3]] -= force


@ti.kernel
def get_force():
    for c in vertices:
        get_force_func(c, vertices[c])
    for u in f:
        f[u].y -= 9.8 * m[u]


@ti.kernel
def matmul_cell(ret: ti.template(), vel: ti.template()):
    for i in ret:
        ret[i] = vel[i] * m[i]
    for c in vertices:
        verts = vertices[c]
        W_c = W[c]
        B_c = B[c]
        for u in range(4):
            for d in range(3):
                dD = ti.Matrix.zero(ti.f32, 3, 3)
                if u == 3:
                    for j in range(3):
                        dD[d, j] = -1
                else:
                    dD[d, u] = 1
                dF = dD @ B_c
                dP = 2.0 * mu * dF
                dH = -W_c * dP @ B_c.transpose()
                for i in range(3):
                    for j in range(3):
                        tmp = (vel[verts[i]][j] - vel[verts[3]][j])
                        ret[verts[u]][d] += -dt ** 2 * dH[j, i] * tmp


@ti.kernel
def add(ans: ti.template(), a: ti.template(), k: ti.f32, b: ti.template()):
    for i in ans:
        ans[i] = a[i] + k * b[i]


@ti.kernel
def dot(a: ti.template(), b: ti.template()) -> ti.f32:
    ans = 0.0
    for i in a:
        ans += a[i].dot(b[i])
    return ans


b = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
r0 = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)
p0 = ti.Vector.field(3, dtype=ti.f32, shape=n_verts)


@ti.kernel
def get_b():
    for i in b:
        b[i] = m[i] * v[i] + dt * f[i]


def cg():
    def mul(x):
        matmul_cell(mul_ans, x)
        return mul_ans

    get_force()
    get_b()
    mul(v)
    add(r0, b, -1, mul(v))

    d = p0
    d.copy_from(r0)
    r_2 = dot(r0, r0)
    n_iter = 50
    epsilon = 1e-6
    r_2_init = r_2
    r_2_new = r_2
    for iter in range(n_iter):
        q = mul(d)
        alpha = r_2_new / dot(d, q)
        add(v, v, alpha, d)
        add(r0, r0, -alpha, q)
        r_2 = r_2_new
        r_2_new = dot(r0, r0)
        if r_2_new <= r_2_init * epsilon ** 2:
            break
        beta = r_2_new / r_2
        add(d, r0, beta, d)
    f.fill(0)
    add(x, x, dt, v)


@ti.kernel
def advect():
    for p in x:
        v[p] += dt * (f[p] / m[p])
        x[p] += dt * v[p]
        f[p] = ti.Vector([0, 0, 0])


@ti.kernel
def init():
    for u in x:
        x[u] = ox[u]
        v[u] = [0.0] * 3
        f[u] = [0.0] * 3
        m[u] = 0.0
    for c in vertices:
        F = Ds(vertices[c])
        B[c] = F.inverse()
        W[c] = ti.abs(F.determinant()) / 6
        for i in range(4):
            m[vertices[c][i]] += W[c] / 4 * density
    for u in x:
        x[u].y += 1.0


@ti.kernel
def floor_bound():
    for u in x:
        if x[u].y < 0:
            x[u].y = 0
            if v[u].y < 0:
                v[u].y = 0


@ti.func
def check(u):
    ans = 0
    rest = u
    for i in ti.static(range(3)):
        k = rest % n_cube[2 - i]
        rest = rest // n_cube[2 - i]
        if k == 0: ans |= (1 << (i * 2))
        if k == n_cube[2 - i] - 1: ans |= (1 << (i * 2 + 1))
    return ans


su = 0
for i in range(3):
    su += (n_cube[i] - 1) * (n_cube[(i + 1) % 3] - 1)
indices = ti.field(ti.i32, shape=2 * su * 2 * 3)


@ti.kernel
def get_indices():
    # calculate all the meshes on surface
    cnt = 0
    for c in vertices:
        if c % 5 != 4:
            for i in ti.static([0, 2, 3]):
                verts = [vertices[c][(i + j) % 4] for j in range(3)]
                sum = check(verts[0]) & check(verts[1]) & check(verts[2])
                if sum:
                    m = ti.atomic_add(cnt, 1)
                    det = ti.Matrix.rows([
                        x[verts[i]] - [0.5, 1.5, 0.5] for i in range(3)
                    ]).determinant()
                    if det < 0:
                        tmp = verts[1]
                        verts[1] = verts[2]
                        verts[2] = tmp
                    indices[m * 3] = verts[0]
                    indices[m * 3 + 1] = verts[1]
                    indices[m * 3 + 2] = verts[2]


def substep():
    for _ in range(1):
        cg()
    floor_bound()


if __name__ == '__main__':
    get_vertices()
    init()
    get_indices()

    res = (800, 600)
    window = ti.ui.Window("Implicit FEM", res, vsync=False)

    frame_id = 0
    canvas = window.get_canvas()
    scene = ti.ui.Scene()
    camera = ti.ui.Camera()
    camera.position(2.0, 2.0, 3.95)
    camera.lookat(0.5, 0.5, 0.5)
    camera.fov(55)


    def render():
        camera.track_user_inputs(window,
                                 movement_speed=0.03,
                                 hold_key=ti.ui.RMB)
        scene.set_camera(camera)

        scene.ambient_light((0.1,) * 3)

        scene.point_light(pos=(0.5, 10.0, 0.5), color=(0.5, 0.5, 0.5))
        scene.point_light(pos=(10.0, 10.0, 10.0), color=(0.5, 0.5, 0.5))

        scene.mesh(x, indices, color=(0.73, 0.33, 0.23))

        canvas.scene(scene)


    while window.running:
        frame_id += 1
        frame_id = frame_id % 256
        substep()
        if window.is_pressed('r'):
            init()
        if window.is_pressed(ti.GUI.ESCAPE):
            break

        render()

        window.show()

The text was updated successfully, but these errors were encountered:

turbo0628 · 2023-04-10T12:33:28Z

Thanks for reporting this performance issue! We will investigate into this soon :)

zxlbig · 2023-04-11T09:40:30Z

I couldn't reproduce your issue on my local environment. May I ask what your local environment is and for the given example, can you give me the printed profiling message with 'ti.init(arch=ti.gpu, kernel_profiler=True)' for 1.4.1 and 1.5.0? @LMeteorYu0330

LMeteorYu0330 · 2023-04-12T08:23:50Z

My local environment is Win10+Python 3.8.
Here are partial screenshots of the profiling. @zxlbig

zxlbig · 2023-04-14T10:07:49Z

I'm sorry, in the example you provided, the difference in performance between matmul_cell implementations is only 0.01ms, and there is no serious performance degradation. We also did not reproduce it locally.

LMeteorYu0330 added the question Question on using Taichi label Apr 10, 2023

taichi-gardener added this to Taichi Lang Apr 10, 2023

github-project-automation bot moved this to Untriaged in Taichi Lang Apr 10, 2023

turbo0628 changed the title ~~从1.4.1更新到1.5.0后同一份代码帧率下降严重~~ [Perf] 3D implicit FEM performance regression upgrading from 1.4.1 to 1.5.0 Apr 10, 2023

lin-hitonami assigned zxlbig Apr 14, 2023

lin-hitonami moved this from Untriaged to Todo in Taichi Lang Apr 14, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Perf] 3D implicit FEM performance regression upgrading from 1.4.1 to 1.5.0 #7779

[Perf] 3D implicit FEM performance regression upgrading from 1.4.1 to 1.5.0 #7779

LMeteorYu0330 commented Apr 10, 2023 •

edited by turbo0628

Loading

turbo0628 commented Apr 10, 2023

zxlbig commented Apr 11, 2023 •

edited

Loading

LMeteorYu0330 commented Apr 12, 2023 •

edited

Loading

zxlbig commented Apr 14, 2023

[Perf] 3D implicit FEM performance regression upgrading from 1.4.1 to 1.5.0 #7779

[Perf] 3D implicit FEM performance regression upgrading from 1.4.1 to 1.5.0 #7779

Comments

LMeteorYu0330 commented Apr 10, 2023 • edited by turbo0628 Loading

turbo0628 commented Apr 10, 2023

zxlbig commented Apr 11, 2023 • edited Loading

LMeteorYu0330 commented Apr 12, 2023 • edited Loading

zxlbig commented Apr 14, 2023

LMeteorYu0330 commented Apr 10, 2023 •

edited by turbo0628

Loading

zxlbig commented Apr 11, 2023 •

edited

Loading

LMeteorYu0330 commented Apr 12, 2023 •

edited

Loading