//
//  KeypointsProcessor.swift
//  PickleBallUI
//
//  Created by Michael Zhang on 1/20/25.
//

import CoreML
import SwiftUI
import Vision

class KeypointsProcessor: ObservableObject {
    @Published var confidenceThreshold: Float = 0.3
    @Published var iouThreshold: Float = 0.6
    @Published var maskThreshold: Float = 0.5

    @MainActor @Published var predictions: [Prediction] = []
    @Published var maskPredictions: [MaskPrediction] = []
    @Published var combinedMaskImage: UIImage?

    private var isProcessing = false

    @Published var currentMask: CGImage?

    // TODO: Add logic to display the mask on a preview layer of the camera.
    func processFrame(_ pixelBuffer: CVPixelBuffer, height: Int, width: Int) async {
        guard !isProcessing else { return }
        isProcessing = true
        defer { isProcessing = false }

        print("Processing frame with dimensions: \(width) x \(height)")

        let mask = await runCoreMLInference(pixelBuffer: pixelBuffer, width: width, height: height)

        if let mask = mask {
            print("Got mask with dimensions: \(mask.maskSize)")
            if let cgImage = mask.toCGImage() {
                print("Successfully created CGImage with dimensions: \(cgImage.width) x \(cgImage.height)")
                // let uiImage = UIImage(cgImage: cgImage)
                // UIImageWriteToSavedPhotosAlbum(uiImage, nil, nil, nil)
                // print("SAVED")
                await MainActor.run {
                    self.currentMask = cgImage
                    print("Updated currentMask on main thread")
                }
            } else {
                print("Failed to create CGImage from mask")
            }
        } else {
            print("No mask generated")
        }
    }

    private func runCoreMLInference(pixelBuffer: CVPixelBuffer, width: Int, height: Int) async -> MaskPrediction? {
        let config = MLModelConfiguration()
        guard let model = try? KeypointDetectorModel(configuration: config) else {
            NSLog("Failed to init model")
            return nil
        }

        do {
            let outputs = try model.prediction(input_image: pixelBuffer)
            let boxesOutput = outputs.var_1052
            let masksOutput = outputs.var_742

            let numSegmentationMasks = 32
            let numClasses = Int(truncating: boxesOutput.shape[1]) - 4 - numSegmentationMasks

            var predictions = getPredictionsFromOutput(
                output: boxesOutput,
                rows: Int(truncating: boxesOutput.shape[1]),
                columns: Int(truncating: boxesOutput.shape[2]),
                numberOfClasses: numClasses,
                inputImgSize: CGSize(width: CVPixelBufferGetWidth(pixelBuffer),
                                     height: CVPixelBufferGetHeight(pixelBuffer)))

            // Drop low-confidence boxes before running NMS.
            predictions.removeAll { $0.score < confidenceThreshold }

            guard !predictions.isEmpty else { return nil }

            // Run non-maximum suppression per class.
            let groupedPredictions = Dictionary(grouping: predictions) { prediction in
                prediction.classIndex
            }

            var nmsPredictions: [Prediction] = []
            for (_, predictions) in groupedPredictions {
                nmsPredictions.append(
                    contentsOf: nonMaximumSuppression(predictions: predictions,
                                                      iouThreshold: iouThreshold,
                                                      limit: 100))
            }

            NSLog("\(nmsPredictions.count) boxes left after performing NMS with IoU threshold of \(iouThreshold)")

            guard !nmsPredictions.isEmpty else { return nil }

            await MainActor.run { [weak self, nmsPredictions] in
                self?.predictions = nmsPredictions
            }

            let maskProtos = getMaskProtosFromOutput(
                output: masksOutput,
                rows: Int(truncating: masksOutput.shape[3]),
                columns: Int(truncating: masksOutput.shape[2]),
                tubes: Int(truncating: masksOutput.shape[1]))

            // TODO: pixelBuffer is not the right source for the width here; pass the
            // original pixel buffer dimensions instead.
            let maskPredictions = masksFromProtos(
                boxPredictions: nmsPredictions,
                maskProtos: maskProtos,
                maskSize: (width: Int(truncating: masksOutput.shape[3]),
                           height: Int(truncating: masksOutput.shape[2])),
                originalImgSize: CGSize(width: width, height: height))

            await MainActor.run { [weak self, maskPredictions] in
                self?.maskPredictions = maskPredictions
            }

            return maskPredictions.first
        } catch {
            NSLog("Error in CoreML inference: \(error)")
            return nil
        }
    }
}
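
// `MaskPrediction.toCGImage()` used in `processFrame` above is defined elsewhere
// in the project. The extension below is a minimal sketch of such a conversion,
// assuming the mask is a flattened single-channel UInt8 buffer of
// `maskSize.width * maskSize.height` bytes. The name `toGrayscaleCGImage` is
// hypothetical; this is illustrative, not the project's actual implementation.
extension MaskPrediction {
    func toGrayscaleCGImage() -> CGImage? {
        let width = maskSize.width
        let height = maskSize.height
        guard width > 0, height > 0, mask.count == width * height else { return nil }

        // Wrap the mask bytes in a data provider and describe them as an
        // 8-bit, single-component grayscale bitmap with no alpha channel.
        guard let provider = CGDataProvider(data: Data(mask) as CFData) else { return nil }
        return CGImage(
            width: width,
            height: height,
            bitsPerComponent: 8,
            bitsPerPixel: 8,
            bytesPerRow: width,
            space: CGColorSpaceCreateDeviceGray(),
            bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue),
            provider: provider,
            decode: nil,
            shouldInterpolate: false,
            intent: .defaultIntent)
    }
}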
extension KeypointsProcessor {
    func getPredictionsFromOutput(
        output: MLMultiArray,
        rows: Int,
        columns: Int,
        numberOfClasses: Int,
        inputImgSize: CGSize
    ) -> [Prediction] {
        guard output.count != 0 else { return [] }

        var predictions = [Prediction]()
        for i in 0..<columns {
            // The first 4 rows of the output hold the box parameters: cx, cy, w, h.
            let centerX = Float(truncating: output[0*columns+i])
            let centerY = Float(truncating: output[1*columns+i])
            let width = Float(truncating: output[2*columns+i])
            let height = Float(truncating: output[3*columns+i])

            // The next `numberOfClasses` rows hold the class scores; keep the best one.
            let (classIndex, score) = {
                var highestScore: Float = 0
                var classIndex: Int = 0
                for j in 0..<numberOfClasses {
                    let score = Float(truncating: output[(4+j)*columns+i])
                    if score > highestScore {
                        highestScore = score
                        classIndex = j
                    }
                }
                return (classIndex, highestScore)
            }()

            // The last 32 rows hold the mask coefficients for this box.
            let maskCoefficients = {
                var coefficients: [Float] = []
                for k in 0..<32 {
                    coefficients.append(Float(truncating: output[(4+numberOfClasses+k)*columns+i]))
                }
                return coefficients
            }()

            // Convert box from xywh to xyxy
            let left = centerX - width/2
            let top = centerY - height/2
            let right = centerX + width/2
            let bottom = centerY + height/2

            let prediction = Prediction(
                classIndex: classIndex,
                score: score,
                xyxy: (left, top, right, bottom),
                maskCoefficients: maskCoefficients,
                inputImgSize: inputImgSize
            )
            predictions.append(prediction)
        }

        return predictions
    }

    func nonMaximumSuppression(
        predictions: [Prediction],
        iouThreshold: Float,
        limit: Int
    ) -> [Prediction] {
        guard !predictions.isEmpty else { return [] }

        let sortedIndices = predictions.indices.sorted {
            predictions[$0].score > predictions[$1].score
        }

        var selected: [Prediction] = []
        var active = [Bool](repeating: true, count: predictions.count)
        var numActive = active.count

        // Starting from the highest-scoring box, keep it and suppress every
        // remaining box that overlaps it by more than the IoU threshold.
        outer: for i in 0..<predictions.count {
            if active[i] {
                let boxA = predictions[sortedIndices[i]]
                selected.append(boxA)

                if selected.count >= limit { break }

                for j in i+1..<predictions.count {
                    if active[j] {
                        let boxB = predictions[sortedIndices[j]]
                        if IOU(a: boxA.xyxy, b: boxB.xyxy) > iouThreshold {
                            active[j] = false
                            numActive -= 1
                            if numActive <= 0 { break outer }
                        }
                    }
                }
            }
        }

        return selected
    }

    private func IOU(a: XYXY, b: XYXY) -> Float {
        // Calculate the intersection coordinates
        let x1 = max(a.x1, b.x1)
        let y1 = max(a.y1, b.y1)
        let x2 = min(a.x2, b.x2)
        let y2 = min(a.y2, b.y2)

        // Calculate the intersection area (zero when the boxes do not overlap)
        let intersection = max(x2 - x1, 0) * max(y2 - y1, 0)

        // Calculate the union area
        let area1 = (a.x2 - a.x1) * (a.y2 - a.y1)
        let area2 = (b.x2 - b.x1) * (b.y2 - b.y1)
        let union = area1 + area2 - intersection

        // Calculate the IoU score
        let iou = intersection / union
        return iou
    }

    func getMaskProtosFromOutput(
        output: MLMultiArray,
        rows: Int,
        columns: Int,
        tubes: Int
    ) -> [[UInt8]] {
        var masks: [[UInt8]] = []
        // Flatten each prototype channel (tube) of the [1, tubes, rows, columns]
        // output into its own byte mask.
        for tube in 0..<tubes {
            var mask: [UInt8] = []
            for i in 0..<(rows*columns) {
                let index = tube*(rows*columns) + i
                mask.append(UInt8(truncating: output[index]))
            }
            masks.append(mask)
        }
        return masks
    }
}
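
// `Prediction`, `MaskPrediction`, and `XYXY` are declared elsewhere in the
// project. The declarations below are a sketch of the shapes this file assumes,
// with field names inferred from usage; they are illustrative only and not the
// project's actual definitions.
typealias XYXY = (x1: Float, y1: Float, x2: Float, y2: Float)

struct Prediction {
    let classIndex: Int
    let score: Float
    let xyxy: XYXY                  // box corners in model-input pixel coordinates
    let maskCoefficients: [Float]   // 32 weights over the segmentation mask prototypes
    let inputImgSize: CGSize
}

struct MaskPrediction {
    let classIndex: Int
    let mask: [UInt8]               // flattened single-channel mask, maskSize.width * maskSize.height bytes
    let maskSize: (width: Int, height: Int)
}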
extension KeypointsProcessor {
    func masksFromProtos(
        boxPredictions: [Prediction],
        maskProtos: [[UInt8]],
        maskSize: (width: Int, height: Int),
        originalImgSize: CGSize
    ) -> [MaskPrediction] {
        NSLog("Generate masks from prototypes")
        var maskPredictions: [MaskPrediction] = []
        for prediction in boxPredictions {
            let maskCoefficients = prediction.maskCoefficients

            // Weighted sum of the prototype masks using this box's coefficients.
            var finalMask: [Float] = []
            for (index, maskProto) in maskProtos.enumerated() {
                let weight = maskCoefficients[index]
                finalMask = finalMask.add(maskProto.map { Float($0) * weight })
            }

            NSLog("Apply sigmoid")
            finalMask = finalMask.map { sigmoid(value: $0) }

            NSLog("Crop mask to bounding box")
            let croppedMask = crop(
                mask: finalMask,
                maskSize: maskSize,
                box: prediction.xyxy)

            let scale = min(
                max(
                    Int(originalImgSize.width) / maskSize.width,
                    Int(originalImgSize.height) / maskSize.height),
                6)
            let targetSize = (
                width: maskSize.width * scale,
                height: maskSize.height * scale)

            NSLog("Upsample mask with size \(maskSize) to \(targetSize)")
            let upsampledMask = croppedMask
                .map { Float(($0 > maskThreshold ? 1 : 0)) }
                .upsample(initialSize: maskSize, scale: scale)
                .map { UInt8(($0 > maskThreshold ? 1 : 0) * 255) }

            maskPredictions.append(
                MaskPrediction(
                    classIndex: prediction.classIndex,
                    mask: upsampledMask,
                    maskSize: targetSize))
        }

        return maskPredictions
    }

    func sigmoid(value: Float) -> Float {
        return 1.0 / (1.0 + exp(-value))
    }

    private func crop(
        mask: [Float],
        maskSize: (width: Int, height: Int),
        box: XYXY
    ) -> [Float] {
        let rows = maskSize.height
        let columns = maskSize.width

        // The box is in model-input coordinates, while the prototype masks are at
        // a quarter of that resolution, hence the division by 4.
        let x1 = Int(box.x1 / 4)
        let y1 = Int(box.y1 / 4)
        let x2 = Int(box.x2 / 4)
        let y2 = Int(box.y2 / 4)

        // Zero out everything outside the bounding box.
        var croppedArr: [Float] = []
        for row in 0..<rows {
            for column in 0..<columns {
                if column >= x1 && column <= x2 && row >= y1 && row <= y2 {
                    croppedArr.append(mask[row*columns+column])
                } else {
                    croppedArr.append(0)
                }
            }
        }
        return croppedArr
    }
}
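
// The `add(_:)` and `upsample(initialSize:scale:)` helpers used in
// `masksFromProtos` live in an Array extension elsewhere in the project. The
// extension below is a minimal sketch of the behavior this file relies on:
// element-wise addition that tolerates an initially empty accumulator, and
// nearest-neighbor upsampling of a flattened row-major mask. The names are
// hypothetical and the code is illustrative, not the project's actual implementation.
extension Array where Element == Float {
    /// Element-wise sum; if the receiver is empty (first iteration of the
    /// accumulation loop in `masksFromProtos`), the other array is returned unchanged.
    func elementWiseAdd(_ other: [Float]) -> [Float] {
        guard !isEmpty else { return other }
        guard count == other.count else { return self }
        return zip(self, other).map { $0 + $1 }
    }

    /// Nearest-neighbor upsampling of a flattened `initialSize` mask by an
    /// integer `scale` factor, returning a flattened mask of
    /// `(width * scale) x (height * scale)`.
    func nearestNeighborUpsample(initialSize: (width: Int, height: Int), scale: Int) -> [Float] {
        let (width, height) = initialSize
        guard count == width * height, scale > 0 else { return self }

        var result = [Float]()
        result.reserveCapacity(count * scale * scale)
        for row in 0..<height {
            // Repeat every pixel `scale` times horizontally...
            var expandedRow = [Float]()
            expandedRow.reserveCapacity(width * scale)
            for column in 0..<width {
                let value = self[row * width + column]
                expandedRow.append(contentsOf: repeatElement(value, count: scale))
            }
            // ...and every expanded row `scale` times vertically.
            for _ in 0..<scale {
                result.append(contentsOf: expandedRow)
            }
        }
        return result
    }
}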