昇腾CANN cann-recipes-spatial-intelligence：空间智能场景的 NPU 推理实战

空间智能是 AI 的下一个前沿——不只是看懂图片里的内容，还要理解三维空间中的物体位置、空间关系、深度感知。cann-recipes-spatial-intelligence 覆盖三大应用：3D 目标检测（自动驾驶/机器人）、NeRF/3DGS（神经辐射场/高斯泼溅，新型三维重建）、SLAM（同步定位与地图构建）。三个方向的共性：输入是图像或视频，输出是空间中的几何信息。

雨季666

11人浏览 · 2026-05-22 12:15:04

雨季666 · 2026-05-22 12:15:04 发布

三类空间智能任务

任务	输入	输出	模型	核心算子
3D 目标检测	多视角相机图像	目标 3D 边界框	BEVFormer / PointPillars	视角变换、3D 卷积
NeRF 神经辐射场	多视角图像 + 相机位姿	场景隐式表示	NeRF / 3D Gaussian Splatting	体密度查询、MipNeRF360
SLAM	连续帧图像 + IMU	相机轨迹 + 3D 地图	ORB-SLAM3 / Direct SLAM	光流、特征匹配、BA

三类任务在 NPU 上的算子需求完全不同——3D 检测依赖大量矩阵乘和 BEV 视角变换，NeRF 依赖体积渲染（Ray Marching），SLAM 依赖特征匹配和几何优化。

3D 目标检测：BEVFormer 的视角变换

BEV（Bird Eye View，鸟瞰图）感知是自动驾驶的核心——把前视相机的图像转成俯视图上的目标检测。

// cann-recipes-spatial-intelligence/kernels/bev_transform.cpp

// BEVFormer 的核心：Spatial Cross-Attention
// 把图像特征投影到 BEV 空间，然后做 self-attention

__aicore__ void SpatialCrossAttention(
    GlobalTensor<float>& bev_query,      // [B, H_bev, W_bev, C]
    GlobalTensor<float>& image_features,  // [B, N_cam, C, H_img, W_img]
    GlobalTensor<float>& camera_K,      // 内参矩阵 [B, N_cam, 3, 3]
    GlobalTensor<float>& camera_R,      // 外参矩阵 [B, N_cam, 3, 3]
    GlobalTensor<float>& camera_T,      // 外参平移向量 [B, N_cam, 3]
    GlobalTensor<float>& grid,          // BEV 网格坐标 [H_bev, W_bev, 3]
    int B, int N_cam, int C, int H_bev, int W_bev,
    int H_img, int W_img
) {
    // BEV 视角变换：从 BEV 坐标反投影到图像坐标
    // 逻辑：
    // BEV 空间的一个点 (x, y, z=0) → 3D 世界坐标 (x, y, 0)
    // 3D 世界坐标 → 相机坐标：T_cam = R^-1 × (P_world - T)
    // 相机坐标 → 像素坐标：p = K × T_cam

    for (int b = 0; b < B; b++) {
        for (int h = 0; h < H_bev; h++) {
            for (int w = 0; w < W_bev; w++) {
                // 取 BEV 网格点
                float bev_x = grid[h * W_bev * 3 + w * 3 + 0];
                float bev_y = grid[h * W_bev * 3 + w * 3 + 1];
                float bev_z = 0.0f;

                // 遍历所有相机
                LocalTensor<float> attended_features(N_cam * C);

                for (int cam = 0; cam < N_cam; cam++) {
                    // 3D 点（BEV → 世界 → 相机 → 像素）
                    // Cube 单元做矩阵乘
                    LocalTensor<float> world_pt(3);  // [x, y, 0]
                    world_pt[0] = bev_x;
                    world_pt[1] = bev_y;
                    world_pt[2] = bev_z;

                    // 逆外参变换：相机坐标 = R^T × (世界坐标 - T)
                    // R^T 是旋转矩阵的逆（因为 R 是正交的）
                    LocalTensor<float> cam_pt(3);
                    MatMul(cam_pt, camera_R[b * N_cam + cam], world_pt);  // R^T × P_world
                    cam_pt[0] -= camera_T[b * N_cam + cam][0];
                    cam_pt[1] -= camera_T[b * N_cam + cam][1];
                    cam_pt[2] -= camera_T[b * N_cam + cam][2];

                    // 检查深度（相机朝向方向）
                    if (cam_pt[2] <= 0.0f) continue;  // 深度为负，跳过

                    // 内参变换：像素坐标 = K × 相机坐标
                    LocalTensor<float> pixel_pt(3);
                    MatMul(pixel_pt, camera_K[b * N_cam + cam], cam_pt);

                    float u = pixel_pt[0] / pixel_pt[2];
                    float v = pixel_pt[1] / pixel_pt[2];

                    // 检查是否在图像范围内
                    if (u < 0 || u >= W_img || v < 0 || v >= H_img) continue;

                    // 双线性插值采样图像特征
                    float feat_val = BilinearSample(image_features[b][cam], u, v);
                    attended_features[cam * C : (cam+1) * C] = feat_val;
                }

                // 对所有相机的特征做加权平均
                bev_query[b * H_bev * W_bev * C + h * W_bev * C + w * C]
                    = Mean(attended_features, N_cam);
            }
        }
    }
}

BEV 视角变换的计算量随相机数量线性增长——6 个相机就是 6 次矩阵乘和 6 次双线性采样。CUBE 单元并行处理多个相机的投影，Vector 单元处理双线性插值。

NeRF 的体积渲染

NeRF（Neural Radiance Fields）用隐式表示描述三维场景——给定一个 3D 坐标 (x, y, z) 和观察方向 (θ, φ)，输出颜色 (R, G, B) 和体密度 σ。

体积渲染（Volumetric Rendering）沿着射线采样点，累积颜色和密度：

// cann-recipes-spatial-intelligence/kernels/nerf_rendering.cpp

// 体积渲染：沿射线采样 N_fine 个点，计算颜色和密度

__aicore__ void NerfVolumeRender(
    GlobalTensor<float>& rgb_map,         // [H, W, 3] 渲染图像
    GlobalTensor<float>& depth_map,        // [H, W] 深度图
    GlobalTensor<float>& ray_origins,     // [H, W, 3] 射线起点
    GlobalTensor<float>& ray_dirs,         // [H, W, 3] 射线方向（归一化）
    GlobalTensor<NeRFNetwork>& network,   // NeRF 网络权重
    int H, int W, int N_samples           // 采样点数
) {
    const float t_near = 0.1f;     // 近平面
    const float t_far = 10.0f;     // 远平面

    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            int ray_idx = y * W + x;

            float3 orig = LoadFloat3(ray_origins, ray_idx);
            float3 dir = LoadFloat3(ray_dirs, ray_idx);

            // 沿射线均匀采样 N 个点
            float t_vals[N_samples];
            for (int i = 0; i < N_samples; i++) {
                float t = t_near + (t_far - t_near) * float(i) / float(N_samples - 1);
                // 添加细粒度扰动（分层采样，降低方差）
                t += (hash(ray_idx * N_samples + i) - 0.5f) * (t_far - t_near) / float(N_samples);
                t_vals[i] = clamp(t, t_near, t_far);
            }

            // 累积颜色和密度
            float3 rgb = {0, 0, 0};
            float transmittance = 1.0f;
            float depth = 0.0f;

            for (int i = 0; i < N_samples; i++) {
                // 射线上的 3D 采样点
                float px = orig.x + dir.x * t_vals[i];
                float py = orig.y + dir.y * t_vals[i];
                float pz = orig.z + dir.z * t_vals[i];

                // NeRF MLP 查询：输入 (x, y, z, θ, φ)，输出 (R, G, B, σ)
                float3 rgb_i;
                float sigma_i;
                network.Query(px, py, pz, dir, rgb_i, sigma_i);

                // Delta = t_{i+1} - t_i
                float delta = (i == N_samples - 1)
                    ? t_far - t_vals[i]
                    : t_vals[i+1] - t_vals[i];

                // 透射率衰减因子
                float alpha = 1.0f - expf(-sigma_i * delta);
                float weight = alpha * transmittance;

                // 累积颜色
                rgb.x += weight * rgb_i.x;
                rgb.y += weight * rgb_i.y;
                rgb.z += weight * rgb_i.z;

                // 累积深度（加权平均）
                depth += weight * t_vals[i];

                // 更新透射率
                transmittance *= (1.0f - alpha);

                if (transmittance < 1e-4f) break;  // 光线基本被遮挡，提前终止
            }

            StoreFloat3(rgb_map, ray_idx, rgb);
            depth_map[ray_idx] = depth;
        }
    }
}

NeRF 的计算瓶颈在 MLP 查询（每个采样点一次）——64 个采样点 × 100 万条射线 = 6400 万次 MLP 调用。每条射线的采样点之间没有数据依赖，适合 NPU 的多核并行。

SLAM 的光流和特征匹配

SLAM（同步定位与地图构建）实时估计相机运动和三维环境结构。关键步骤：光流（跟踪相邻帧的像素运动）和特征匹配（找两张图之间的对应关系）。

// cann-recipes-spatial-intelligence/kernels/slam_optical_flow.cpp

// 光流计算：Lucas-Kanade 算法的 NPU 实现
// 假设邻域内所有像素有相同的运动向量

__aicore__ void OpticalFlowLK(
    GlobalTensor<float>& flow_u,         // 光流水平分量 [H, W]
    GlobalTensor<float>& flow_v,         // 光流垂直分量 [H, W]
    GlobalTensor<uchar>& img1,           // 第 1 帧图像
    GlobalTensor<uchar>& img2,           // 第 2 帧图像
    int H, int W,
    int window_size = 5                  // Lucas-Kanade 窗口大小
) {
    int half_win = window_size / 2;

    for (int y = half_win; y < H - half_win; y++) {
        for (int x = half_win; x < W - half_win; x++) {
            // 构建局部邻域的 Ix, Iy, It 矩阵
            // Ix: 图像 x 方向梯度
            // Iy: 图像 y 方向梯度
            // It: 帧间时间梯度

            float A[4] = {0, 0, 0, 0};  // [2x2] 矩阵 A^T A
            float B[2] = {0, 0};         // [2x1] 向量 A^T B

            for (int wy = -half_win; wy <= half_win; wy++) {
                for (int wx = -half_win; wx <= half_win; wx++) {
                    int x1 = x + wx, y1 = y + wy;
                    int x2 = x1 + 1, y2 = y1;  // x+1 方向
                    int x3 = x1, y3 = y1 + 1;  // y+1 方向

                    // Sobel 算子计算 Ix 和 Iy
                    float Ix = (float(img2[y1][x2]) - float(img2[y1][x1])) / 2.0f;
                    float Iy = (float(img2[y3][x1]) - float(img2[y1][x1])) / 2.0f;

                    // 时间梯度 It
                    float It = float(img2[y1][x1]) - float(img1[y1][x1]);

                    A[0] += Ix * Ix;
                    A[1] += Ix * Iy;
                    A[2] += Ix * Iy;
                    A[3] += Iy * Iy;
                    B[0] += -Ix * It;
                    B[1] += -Iy * It;
                }
            }

            // 解 2×2 线性方程组 A × [u; v] = B
            // [A[0]  A[1]] [u] = [B[0]]
            // [A[2]  A[3]] [v] = [B[1]]
            float det = A[0]*A[3] - A[1]*A[2];
            if (fabs(det) > 1e-6f) {
                flow_u[y * W + x] = (A[3]*B[0] - A[1]*B[1]) / det;
                flow_v[y * W + x] = (-A[2]*B[0] + A[0]*B[1]) / det;
            } else {
                flow_u[y * W + x] = 0.0f;
                flow_v[y * W + x] = 0.0f;
            }
        }
    }
}

光流计算的核心是 2×2 矩阵的求逆——Vector 单元的逐元素操作在这里非常高效。窗口内每个像素参与一次累加，窗口滑动过程可以复用中间结果。

踩坑一：NeRF 的射线采样在长序列上显存爆炸

NeRF 渲染一张图需要沿每条射线采样 N 个点。1920×1080 的图 × 64 个采样点 = 1.33 亿个采样点，每个采样点调用一次 MLP（查询颜色和密度）。

错误：一次性把所有采样点送进 MLP。

// 一次性处理所有射线和所有采样点
// 1.33 亿个采样点全部加载到 HBM
// MLP 查询需要同时保存输入和中间激活值
// HBM 占用 = 1.33亿 × (输入3D坐标3×float + MLP中间层~10×float) × 4B ≈ 69 GB
// 单卡 HBM 只有 64 GB，还要加载网络权重 → 爆显存

正确做法：按 batch 分批渲染，同 batch 的射线共享 MLP 的中间激活值。

// 分批渲染：每批处理 4096 条射线
// 4096 × 64 = 262144 个采样点
// HBM 占用 ≈ 262144 × 40 × 4B = 42 MB
// 完全适配单卡显存

const int RAYS_PER_BATCH = 4096;
int num_rays = H * W;
int num_batches = (num_rays + RAYS_PER_BATCH - 1) / RAYS_PER_BATCH;

for (int batch = 0; batch < num_batches; batch++) {
    int start = batch * RAYS_PER_BATCH;
    int end = min(start + RAYS_PER_BATCH, num_rays);

    // 只加载当前 batch 的射线数据
    LoadRayBatch(ray_origins, ray_dirs, start, end);

    // 渲染当前 batch
    RenderBatch(output_rgb, output_depth, start, end);
}

踩坑二：SLAM 的光流在低纹理区域失效

Lucas-Kanade 光流假设窗口内所有像素有相同运动——这个假设在低纹理区域（白墙、天空）完全失效：图像梯度为零（Ix = Iy = 0），矩阵 A 是奇异的，方程无解。

修复：在低纹理区域跳过光流计算。

// 先检测图像的梯度幅值
// 梯度幅值小于阈值 → 低纹理区域 → 光流不可靠

float grad_mag = sqrt(A[0] + A[3]);  // = Ix² + Iy²

if (grad_mag < 1e-4f) {
    // 低纹理区域：设置光流为 0 或用附近像素的外插值
    flow_u[y * W + x] = 0.0f;
    flow_v[y * W + x] = 0.0f;
    continue;
}

// 只在纹理丰富的区域计算光流
// 非极大值抑制：保留响应最强的点

踩坑三：3D 检测的相机外参在线标定误差

BEVFormer 的性能依赖相机外参（R, T）的标定精度。实际车载环境中，相机安装位置随时间漂移（热胀冷缩、震动），旧的外参逐渐失效。

症状：BEV 检测的远处目标出现系统性偏移——不是随机误差，而是固定方向的偏差。

修复：在线外参优化（Online Extrinsic Calibration）。在每帧图像中提取车道线或地面特征，用这些特征和上一帧的外参联合优化 R 和 T。

// 在线外参优化（简化为单参数偏移）
// 假设外参只有平移偏移：T_real = T_initial + ΔT
// ΔT 通过最小化重投影误差得到

__aicore__ void OnlineExtrinsicCalib(
    GlobalTensor<float>& delta_T,         // 外参平移偏移 [3]
    GlobalTensor<float>& lane_3d,        // 检测到的 3D 车道线点
    GlobalTensor<float>& lane_2d_proj,  // 车道线在图像上的投影
    GlobalTensor<float>& camera_K,       // 内参
    GlobalTensor<float>& camera_R,      // 外参旋转（已知固定）
    int num_lanes
) {
    // 重投影误差：车道线 3D 点投影到图像，和检测到的车道线像素的差异
    // e_i = ||project(lane_3d[i], K, R, T_initial + ΔT) - lane_2d_proj[i]||²

    // 高斯-牛顿迭代优化 ΔT
    for (int iter = 0; iter < 10; iter++) {
        float2 total_error = {0, 0};

        for (int i = 0; i < num_lanes; i++) {
            float3 p_3d = lane_3d[i];
            float2 p_proj;

            // 用当前 ΔT 做投影
            float3 T_curr = T_initial + delta_T;
            ProjectToImage(p_3d, camera_K, camera_R, T_curr, p_proj);

            // 误差
            float2 e = p_proj - lane_2d_proj[i];
            total_error += e;
        }

        // 更新 ΔT（梯度下降方向）
        // 简化：ΔT_new = ΔT - lr × total_error / num_lanes
        float lr = 0.01f;
        delta_T -= lr * total_error / float(num_lanes);

        // 收敛判断
        if (Length(total_error) < 0.1f) break;
    }
}

在线标定的收敛速度快（3-5 次迭代），但会消耗一部分计算资源。在资源紧张时可以每 N 帧做一次标定，而不是每帧都做。

空间智能三条线——3D 检测、NeRF、SLAM——都是自动驾驶和机器人感知的基础能力。三个场景在算子层面的需求各不相同：BEVFormer 依赖高效矩阵乘和视角变换，NeRF 依赖体渲染（Ray Marching），SLAM 依赖光流和特征匹配。但它们的共同点是实时性要求高——自动驾驶的前视感知需要在 50ms 内完成，SLAM 需要在 30fps 下跑通。cann-recipes-spatial-intelligence 把这三类场景的优化路径都整理成菜谱。