From 1401b8237bd4845c5ac421995dbfc7e501104f67 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Thu, 17 Oct 2024 08:16:12 -0700 Subject: [PATCH 01/16] integrate geo model, first steps --- .../VisionCameraPluginInatVision.m | 432 +++++++++++++----- 1 file changed, 309 insertions(+), 123 deletions(-) diff --git a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m index 158432f..902593f 100644 --- a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m +++ b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m @@ -6,157 +6,343 @@ @import UIKit; @import Vision; @import CoreML; +@import Accelerate; #import "VCPTaxonomy.h" #import "VCPPrediction.h" +@class VCPGeoModel; + @interface VisionCameraPluginInatVisionPlugin : FrameProcessorPlugin -+ (VCPTaxonomy*) taxonomyWithTaxonomyFile:(NSString*)taxonomyPath; -+ (VNCoreMLModel*) visionModelWithModelFile:(NSString*)modelPath; ++ (VCPTaxonomy *) taxonomyWithTaxonomyFile:(NSString *)taxonomyPath; ++ (VNCoreMLModel *)visionModelWithModelFile:(NSString *)modelPath; ++ (VCPGeoModel *)geoModelWithModelFile:(NSString *)geoModelPath; + +@end + +@interface VCPGeoModel: NSObject + +- (instancetype)initWithModelPath:(NSString *)modelPath; +- (MLMultiArray *)predictionsForLat:(float)latitude lng:(float)longitude elevation:(float)elevation; + +@property MLModel *geoModel; @end + @implementation VisionCameraPluginInatVisionPlugin -+ (VCPTaxonomy*) taxonomyWithTaxonomyFile:(NSString*)taxonomyPath { - static VCPTaxonomy* taxonomy = nil; - if (taxonomy == nil) { - taxonomy = [[VCPTaxonomy alloc] initWithTaxonomyFile:taxonomyPath]; - } - return taxonomy; ++ (VCPTaxonomy *)taxonomyWithTaxonomyFile:(NSString *)taxonomyPath { + static VCPTaxonomy *taxonomy = nil; + if (taxonomy == nil) { + taxonomy = [[VCPTaxonomy alloc] initWithTaxonomyFile:taxonomyPath]; + } + return taxonomy; } -+ (VNCoreMLModel*) visionModelWithModelFile:(NSString*)modelPath { - static VNCoreMLModel* visionModel = nil; - if (visionModel == nil) { - // Setup vision - NSURL *modelUrl = [NSURL fileURLWithPath:modelPath]; - if (!modelUrl) { - // TODO: handle this error - // [self.delegate classifierError:@"no file for optimized model"]; - NSLog(@"no file for optimized model"); - return nil; ++ (VCPGeoModel *)geoModelWithModelFile:(NSString *)modelPath { + static VCPGeoModel *geoModel = nil; + + if (geoModel == nil) { + geoModel = [[VCPGeoModel alloc] initWithModelPath:modelPath]; } + + return geoModel; +} - NSError *loadError = nil; - MLModel *model = [MLModel modelWithContentsOfURL:modelUrl - error:&loadError]; - if (loadError) { - NSString *errString = [NSString stringWithFormat:@"error loading model: %@", - loadError.localizedDescription]; - NSLog(@"%@", errString); - // TODO: handle this error - // [self.delegate classifierError:errString]; - return nil; - } - if (!model) { - // TODO: handle this error - // [self.delegate classifierError:@"unable to make model"]; - NSLog(@"unable to make model"); - return nil; ++ (VNCoreMLModel *)visionModelWithModelFile:(NSString *)modelPath { + static VNCoreMLModel *visionModel = nil; + if (visionModel == nil) { + // Setup vision + //NSURL *modelUrl = [NSURL fileURLWithPath:modelPath]; + NSURL *modelUrl = [[NSBundle mainBundle] URLForResource:@"cvmodel" withExtension:@"mlmodelc"]; + if (!modelUrl) { + // TODO: handle this error + // [self.delegate classifierError:@"no file for optimized model"]; + NSLog(@"no file for optimized model"); + return nil; + } + + NSError *loadError = nil; + MLModel *model = [MLModel modelWithContentsOfURL:modelUrl + error:&loadError]; + if (loadError) { + NSString *errString = [NSString stringWithFormat:@"error loading model: %@", + loadError.localizedDescription]; + NSLog(@"vision model mlmodel load error: %@", errString); + // TODO: handle this error + // [self.delegate classifierError:errString]; + return nil; + } else { + NSLog(@"no error produced while loading vision model"); + } + + if (!model) { + // TODO: handle this error + // [self.delegate classifierError:@"unable to make model"]; + NSLog(@"unable to make vision mlmodel"); + return nil; + } + + NSError *modelError = nil; + visionModel = [VNCoreMLModel modelForMLModel:model + error:&modelError]; + if (modelError) { + NSString *errString = [NSString stringWithFormat:@"error making vision model: %@", + modelError.localizedDescription]; + // [self.delegate classifierError:errString]; + NSLog(@"vision model vncoreml load error %@", errString); + return nil; + } + if (!visionModel) { + // [self.delegate classifierError:@"unable to make vision model"]; + NSLog(@"unable to make vision model vncoreml"); + return nil; + } } + + return visionModel; +} - NSError *modelError = nil; - visionModel = [VNCoreMLModel modelForMLModel:model - error:&modelError]; - if (modelError) { - NSString *errString = [NSString stringWithFormat:@"error making vision model: %@", - modelError.localizedDescription]; - // [self.delegate classifierError:errString]; - NSLog(@"%@", errString); +- (instancetype)initWithProxy:(VisionCameraProxyHolder*)proxy + withOptions:(NSDictionary* _Nullable)options { + self = [super initWithProxy:proxy withOptions:options]; + return self; +} + +- (MLMultiArray *)combineVisionScores:(MLMultiArray *)visionScores with:(MLMultiArray *)geoScores error:(NSError **)error { + // Ensure both arrays have the same shape + if (![visionScores.shape isEqualToArray:geoScores.shape]) { + if (error) { + *error = [NSError errorWithDomain:@"MLMultiArrayErrorDomain" + code:1 + userInfo:@{NSLocalizedDescriptionKey: @"Arrays must have the same shape"}]; + } return nil; } - if (!visionModel) { - // [self.delegate classifierError:@"unable to make vision model"]; - NSLog(@"unable to make vision model"); + + // Create a result MLMultiArray with the same shape as the input arrays + MLMultiArray *combinedArray = [[MLMultiArray alloc] initWithShape:visionScores.shape + dataType:MLMultiArrayDataTypeDouble + error:error]; + if (!combinedArray) { return nil; } - } - return visionModel; + + // Get the data pointers + double *visionData = (double *)visionScores.dataPointer; + double *geoData = (double *)geoScores.dataPointer; + double *combinedData = (double *)combinedArray.dataPointer; + + // Get the number of elements + NSInteger count = visionScores.count; + + // Perform element-wise multiplication using vDSP_vmul + vDSP_vmulD(visionData, 1, geoData, 1, combinedData, 1, count); + + return combinedArray; } -- (instancetype)initWithProxy:(VisionCameraProxyHolder*)proxy - withOptions:(NSDictionary* _Nullable)options { - self = [super initWithProxy:proxy withOptions:options]; - return self; +- (MLMultiArray *)normalizeMultiArray:(MLMultiArray *)mlArray error:(NSError **)error { + NSInteger count = mlArray.count; + double *mlData = (double *)mlArray.dataPointer; + + double sum = 0.0; + vDSP_sveD(mlData, 1, &sum, count); + + // Normalize by dividing each element by the sum + if (sum != 0) { + vDSP_vsdivD(mlData, 1, &sum, mlData, 1, count); + } else { + if (error) { + *error = [NSError errorWithDomain:@"MLMultiArrayErrorDomain" + code:2 + userInfo:@{NSLocalizedDescriptionKey: @"Sum of elements is zero, normalization not possible."}]; + } + return nil; + } + + return mlArray; } - (id)callback:(Frame*)frame withArguments:(NSDictionary*)arguments { - // Start timestamp - NSDate *startDate = [NSDate date]; - - // Log arguments - NSLog(@"inatVision arguments: %@", arguments); - // Destructure version out of options - NSString* version = arguments[@"version"]; - // Destructure model path out of options - NSString* modelPath = arguments[@"modelPath"]; - // Destructure taxonomy path out of options - NSString* taxonomyPath = arguments[@"taxonomyPath"]; - - CMSampleBufferRef buffer = frame.buffer; - UIImageOrientation orientation = frame.orientation; - - CVImageBufferRef pixelBuffer = CMSampleBufferGetImageBuffer(buffer); - if (!pixelBuffer) { - NSLog(@"unable to get pixel buffer"); - return nil; - } - - // Setup taxonomy - VCPTaxonomy *taxonomy = [VisionCameraPluginInatVisionPlugin taxonomyWithTaxonomyFile:taxonomyPath]; - - // Setup vision model - VNCoreMLModel *visionModel = [VisionCameraPluginInatVisionPlugin visionModelWithModelFile:modelPath]; - - // Setup top branches - NSMutableArray *topBranches = [NSMutableArray array]; - VNRequestCompletionHandler recognitionHandler = ^(VNRequest * _Nonnull request, NSError * _Nullable error) { - VNCoreMLFeatureValueObservation *firstResult = request.results.firstObject; - MLFeatureValue *firstFV = firstResult.featureValue; - MLMultiArray *mm = firstFV.multiArrayValue; - - NSArray *bestBranch = [taxonomy inflateTopBranchFromClassification:mm]; - // add this to the end of the recent top branches array - [topBranches addObject:bestBranch]; - }; - - VNCoreMLRequest *objectRecognition = [[VNCoreMLRequest alloc] initWithModel:visionModel - completionHandler:recognitionHandler]; - objectRecognition.imageCropAndScaleOption = VNImageCropAndScaleOptionCenterCrop; - NSArray *requests = @[objectRecognition]; - - VNImageRequestHandler *handler = [[VNImageRequestHandler alloc] initWithCVPixelBuffer:pixelBuffer - orientation:orientation - options:@{}]; - NSError *requestError = nil; - [handler performRequests:requests - error:&requestError]; - if (requestError) { - NSString *errString = [NSString stringWithFormat:@"got a request error: %@", - requestError.localizedDescription]; - NSLog(@"%@", errString); - return nil; - } - - // convert the VCPPredictions in the bestRecentBranch into dicts - NSMutableArray *bestBranchAsDict = [NSMutableArray array]; - for (VCPPrediction *prediction in topBranches.firstObject) { - [bestBranchAsDict addObject:[prediction asDict]]; - } - - // Create a new dictionary with the bestBranchAsDict under the key "predictions" - NSDictionary *response = [NSDictionary dictionary]; - response = @{@"predictions": bestBranchAsDict}; - - // End timestamp - NSTimeInterval timeElapsed = [[NSDate date] timeIntervalSinceDate:startDate]; - NSLog(@"inatVision took %f seconds", timeElapsed); - - return response; + // Start timestamp + NSDate *startDate = [NSDate date]; + + MLMultiArray *geoModelPreds = nil; + if ([arguments objectForKey:@"latitude"] + && [arguments objectForKey:@"longitude"] + && [arguments objectForKey:@"elevation"] + && [arguments objectForKey:@"geoModelPath"]) + { + NSString *geoModelPath = arguments[@"geoModelPath"]; + VCPGeoModel *geoModel = [VisionCameraPluginInatVisionPlugin geoModelWithModelFile:geoModelPath]; + geoModelPreds = [geoModel predictionsForLat:[[arguments objectForKey:@"latitude"] floatValue] + lng:[[arguments objectForKey:@"longitude"] floatValue] + elevation:[[arguments objectForKey:@"elevation"] floatValue]]; + } else { + NSLog(@"not doing anything geo related."); + } + + NSLog(@"got %ld geo model scores", geoModelPreds.count); + + // Log arguments + NSLog(@"inatVision arguments: %@", arguments); + // Destructure version out of options + NSString* version = arguments[@"version"]; + // Destructure model path out of options + NSString* modelPath = arguments[@"modelPath"]; + // Destructure taxonomy path out of options + NSString* taxonomyPath = arguments[@"taxonomyPath"]; + + CMSampleBufferRef buffer = frame.buffer; + UIImageOrientation orientation = frame.orientation; + + CVImageBufferRef pixelBuffer = CMSampleBufferGetImageBuffer(buffer); + if (!pixelBuffer) { + NSLog(@"unable to get pixel buffer"); + return nil; + } + + // Setup taxonomy + VCPTaxonomy *taxonomy = [VisionCameraPluginInatVisionPlugin taxonomyWithTaxonomyFile:taxonomyPath]; + + // Setup vision model + VNCoreMLModel *visionModel = [VisionCameraPluginInatVisionPlugin visionModelWithModelFile:modelPath]; + + // Setup top branches + NSMutableArray *topBranches = [NSMutableArray array]; + VNRequestCompletionHandler recognitionHandler = ^(VNRequest * _Nonnull request, NSError * _Nullable error) { + VNCoreMLFeatureValueObservation *firstResult = request.results.firstObject; + MLFeatureValue *firstFV = firstResult.featureValue; + MLMultiArray *visionScores = firstFV.multiArrayValue; + + MLMultiArray *mm = nil; + if (geoModelPreds != nil) { + NSError *err = nil; + mm = [self combineVisionScores:visionScores with:geoModelPreds error:&err]; + mm = [self normalizeMultiArray:mm error:&err]; + } else { + mm = visionScores; + } + + NSArray *bestBranch = [taxonomy inflateTopBranchFromClassification:mm]; + // add this to the end of the recent top branches array + [topBranches addObject:bestBranch]; + }; + + VNCoreMLRequest *objectRecognition = [[VNCoreMLRequest alloc] initWithModel:visionModel + completionHandler:recognitionHandler]; + objectRecognition.imageCropAndScaleOption = VNImageCropAndScaleOptionCenterCrop; + NSArray *requests = @[objectRecognition]; + + VNImageRequestHandler *handler = [[VNImageRequestHandler alloc] initWithCVPixelBuffer:pixelBuffer + orientation:orientation + options:@{}]; + NSError *requestError = nil; + [handler performRequests:requests + error:&requestError]; + if (requestError) { + NSString *errString = [NSString stringWithFormat:@"got a request error: %@", + requestError.localizedDescription]; + NSLog(@"%@", errString); + return nil; + } + + // convert the VCPPredictions in the bestRecentBranch into dicts + NSMutableArray *bestBranchAsDict = [NSMutableArray array]; + for (VCPPrediction *prediction in topBranches.firstObject) { + [bestBranchAsDict addObject:[prediction asDict]]; + } + + // Create a new dictionary with the bestBranchAsDict under the key "predictions" + NSDictionary *response = [NSDictionary dictionary]; + response = @{@"predictions": bestBranchAsDict}; + + // End timestamp + NSTimeInterval timeElapsed = [[NSDate date] timeIntervalSinceDate:startDate]; + NSLog(@"inatVision took %f seconds", timeElapsed); + + return response; } VISION_EXPORT_FRAME_PROCESSOR(VisionCameraPluginInatVisionPlugin, inatVision) @end + + + + +@implementation VCPGeoModel + +- (instancetype _Nullable)initWithModelPath:(NSString *)modelPath { + if (self = [super init]) { + NSURL *geoModelUrl = [NSURL fileURLWithPath:modelPath]; + if (!geoModelUrl) { + NSLog(@"no file for geo model"); + return nil; + } + + NSError *loadError = nil; + self.geoModel = [MLModel modelWithContentsOfURL:geoModelUrl error:&loadError]; + if (loadError) { + NSString *errString = [NSString stringWithFormat:@"error loading model: %@", + loadError.localizedDescription]; + NSLog(@"%@", errString); + return nil; + } + if (!self.geoModel) { + NSLog(@"unable to make geo model"); + return nil; + } + } + + return self; +} + +- (NSArray *)normAndEncodeLat:(float)latitude lng:(float)longitude elevation:(float)elevation { + float normLat = latitude / 90.0; + float normLng = longitude / 180.0; + float normElev = 0.0; + if (elevation > 0) { + normElev = elevation / 5705.63; + } else { + normElev = elevation / 32768.0; + } + float a = sin(M_PI * normLng); + float b = sin(M_PI * normLat); + float c = cos(M_PI * normLng); + float d = cos(M_PI * normLat); + + return @[ @(a), @(b), @(c), @(d), @(normElev) ]; +} + +- (MLMultiArray *)predictionsForLat:(float)latitude lng:(float)longitude elevation:(float)elevation { + NSArray *geoModelInputs = [self normAndEncodeLat:latitude + lng:longitude + elevation:elevation]; + + NSError *err = nil; + MLMultiArray *mlInputs = [[MLMultiArray alloc] initWithShape:@[@1, @5] + dataType:MLMultiArrayDataTypeDouble + error:&err]; + for (int i = 0; i < 5; i++) { + mlInputs[i] = geoModelInputs[i]; + } + MLFeatureValue *fv = [MLFeatureValue featureValueWithMultiArray:mlInputs]; + + NSError *fpError = nil; + NSDictionary *fpDict = @{ @"input_1": fv }; + MLDictionaryFeatureProvider *fp = [[MLDictionaryFeatureProvider alloc] initWithDictionary:fpDict + error:&fpError]; + + NSError *predError = nil; + id results = [self.geoModel predictionFromFeatures:fp error:&predError]; + MLFeatureValue *result = [results featureValueForName:@"Identity"]; + MLMultiArray *geoModelScores = result.multiArrayValue; + + return geoModelScores; +} + +@end From ee817b086bd616cebfc8682fc4548228f7968dab Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Sat, 19 Oct 2024 11:51:16 -0700 Subject: [PATCH 02/16] refactor vision component into its own class --- example/src/App.tsx | 2 +- ios/Classifier/VCPVisionModel.h | 31 +++++ ios/Classifier/VCPVisionModel.m | 82 ++++++++++++ .../project.pbxproj | 6 + .../VisionCameraPluginInatVision.m | 123 ++++-------------- 5 files changed, 147 insertions(+), 97 deletions(-) create mode 100644 ios/Classifier/VCPVisionModel.h create mode 100644 ios/Classifier/VCPVisionModel.m diff --git a/example/src/App.tsx b/example/src/App.tsx index a00978c..e516b74 100644 --- a/example/src/App.tsx +++ b/example/src/App.tsx @@ -296,7 +296,7 @@ export default function App(): React.JSX.Element { isActive={true} frameProcessor={frameProcessor} enableZoomGesture - pixelFormat={'yuv'} + pixelFormat={'rgb'} resizeMode="contain" enableFpsGraph={true} photoQualityBalance="quality" diff --git a/ios/Classifier/VCPVisionModel.h b/ios/Classifier/VCPVisionModel.h new file mode 100644 index 0000000..968940b --- /dev/null +++ b/ios/Classifier/VCPVisionModel.h @@ -0,0 +1,31 @@ +// +// VCPVisionModel.h +// VisionCameraPluginInatVision +// +// Created by Alex Shepard on 10/18/24. +// Copyright © 2024 Facebook. All rights reserved. +// + +@import CoreML; +@import Vision; + +#import + +NS_ASSUME_NONNULL_BEGIN + +@interface VCPVisionModel : NSObject + +- (instancetype _Nullable)initWithModelPath:(NSString *)modelPath; +- (MLMultiArray * _Nullable)visionPredictionsFor:(CVPixelBufferRef)pixBuf orientation:(UIImageOrientation)orient; + +@property MLModel *cvModel; +@property VNCoreMLModel *visionModel; + +@property VNCoreMLRequest *classification; +@property NSArray *requests; + +@property MLMultiArray *recentVisionScores; + +@end + +NS_ASSUME_NONNULL_END diff --git a/ios/Classifier/VCPVisionModel.m b/ios/Classifier/VCPVisionModel.m new file mode 100644 index 0000000..b07d0cc --- /dev/null +++ b/ios/Classifier/VCPVisionModel.m @@ -0,0 +1,82 @@ +// +// VCPVisionModel.m +// VisionCameraPluginInatVision +// +// Created by Alex Shepard on 10/18/24. +// Copyright © 2024 Facebook. All rights reserved. +// + +#import "VCPVisionModel.h" + +@implementation VCPVisionModel + +- (instancetype _Nullable)initWithModelPath:(NSString *)modelPath { + if (self = [super init]) { + NSURL *visionModelUrl = [NSURL fileURLWithPath:modelPath]; + if (!visionModelUrl) { + NSLog(@"no file for vision model"); + return nil; + } + + NSError *loadError = nil; + self.cvModel = [MLModel modelWithContentsOfURL:visionModelUrl error:&loadError]; + if (loadError) { + NSString *errString = [NSString stringWithFormat:@"error loading cv model: %@", + loadError.localizedDescription]; + NSLog(@"%@", errString); + return nil; + } + if (!self.cvModel) { + NSLog(@"unable to make cv model"); + return nil; + } + + NSError *modelError = nil; + self.visionModel = [VNCoreMLModel modelForMLModel:self.cvModel + error:&modelError]; + + __weak typeof(self) weakSelf = self; + VNRequestCompletionHandler recognitionHandler = ^(VNRequest * _Nonnull request, NSError * _Nullable error) { + + VNCoreMLFeatureValueObservation *firstResult = request.results.firstObject; + MLFeatureValue *firstFV = firstResult.featureValue; + weakSelf.recentVisionScores = firstFV.multiArrayValue; + }; + + self.classification = [[VNCoreMLRequest alloc] initWithModel:self.visionModel + completionHandler:recognitionHandler]; + self.classification.imageCropAndScaleOption = VNImageCropAndScaleOptionCenterCrop; + self.requests = @[ self.classification ]; + } + + return self; +} + +- (MLMultiArray * _Nullable)visionPredictionsFor:(CVPixelBufferRef)pixBuf orientation:(UIImageOrientation)orient { + CGImagePropertyOrientation cgOrient = [self cgOrientationFor:orient]; + VNImageRequestHandler *handler = [[VNImageRequestHandler alloc] initWithCVPixelBuffer:pixBuf + orientation:cgOrient + options:@{}]; + + NSError *requestError = nil; + [handler performRequests:self.requests + error:&requestError]; + + return self.recentVisionScores; +} + +- (CGImagePropertyOrientation)cgOrientationFor:(UIImageOrientation)uiOrientation { + switch (uiOrientation) { + case UIImageOrientationUp: return kCGImagePropertyOrientationUp; + case UIImageOrientationDown: return kCGImagePropertyOrientationDown; + case UIImageOrientationLeft: return kCGImagePropertyOrientationLeft; + case UIImageOrientationRight: return kCGImagePropertyOrientationRight; + case UIImageOrientationUpMirrored: return kCGImagePropertyOrientationUpMirrored; + case UIImageOrientationDownMirrored: return kCGImagePropertyOrientationDownMirrored; + case UIImageOrientationLeftMirrored: return kCGImagePropertyOrientationLeftMirrored; + case UIImageOrientationRightMirrored: return kCGImagePropertyOrientationRightMirrored; + } +} + +@end + diff --git a/ios/VisionCameraPluginInatVision.xcodeproj/project.pbxproj b/ios/VisionCameraPluginInatVision.xcodeproj/project.pbxproj index 29b649e..976ea7f 100644 --- a/ios/VisionCameraPluginInatVision.xcodeproj/project.pbxproj +++ b/ios/VisionCameraPluginInatVision.xcodeproj/project.pbxproj @@ -13,6 +13,7 @@ 8F3C41832A4AFF5B008FBC67 /* VCPClassifier.m in Sources */ = {isa = PBXBuildFile; fileRef = 8F3C417C2A4AFF5B008FBC67 /* VCPClassifier.m */; }; 8F3C41842A4AFF5B008FBC67 /* VCPTaxonomy.m in Sources */ = {isa = PBXBuildFile; fileRef = 8F3C41802A4AFF5B008FBC67 /* VCPTaxonomy.m */; }; 8F3C41852A4AFF5B008FBC67 /* VCPPrediction.m in Sources */ = {isa = PBXBuildFile; fileRef = 8F3C41812A4AFF5B008FBC67 /* VCPPrediction.m */; }; + FA5FF9642CC3182D00BA8E22 /* VCPVisionModel.m in Sources */ = {isa = PBXBuildFile; fileRef = FA5FF9632CC3182D00BA8E22 /* VCPVisionModel.m */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -39,6 +40,8 @@ 8F3C417F2A4AFF5B008FBC67 /* VCPTaxonomy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = VCPTaxonomy.h; sourceTree = ""; }; 8F3C41802A4AFF5B008FBC67 /* VCPTaxonomy.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = VCPTaxonomy.m; sourceTree = ""; }; 8F3C41812A4AFF5B008FBC67 /* VCPPrediction.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = VCPPrediction.m; sourceTree = ""; }; + FA5FF9622CC3182D00BA8E22 /* VCPVisionModel.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = VCPVisionModel.h; sourceTree = ""; }; + FA5FF9632CC3182D00BA8E22 /* VCPVisionModel.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = VCPVisionModel.m; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -89,6 +92,8 @@ 8F3C41812A4AFF5B008FBC67 /* VCPPrediction.m */, 8F3C417F2A4AFF5B008FBC67 /* VCPTaxonomy.h */, 8F3C41802A4AFF5B008FBC67 /* VCPTaxonomy.m */, + FA5FF9622CC3182D00BA8E22 /* VCPVisionModel.h */, + FA5FF9632CC3182D00BA8E22 /* VCPVisionModel.m */, ); path = Classifier; sourceTree = ""; @@ -154,6 +159,7 @@ 8F3C41832A4AFF5B008FBC67 /* VCPClassifier.m in Sources */, 1DB58401D995067FF278746C /* VisionCameraPluginInatVision.m in Sources */, 8F3C41842A4AFF5B008FBC67 /* VCPTaxonomy.m in Sources */, + FA5FF9642CC3182D00BA8E22 /* VCPVisionModel.m in Sources */, 8F3C41822A4AFF5B008FBC67 /* VCPNode.m in Sources */, 8F2D62CB2B1B9DD500412573 /* VisionCameraPluginInatVisionModule.m in Sources */, ); diff --git a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m index 902593f..bcf9371 100644 --- a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m +++ b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m @@ -7,17 +7,19 @@ @import Vision; @import CoreML; @import Accelerate; +@import CoreGraphics; #import "VCPTaxonomy.h" #import "VCPPrediction.h" @class VCPGeoModel; +#import "VCPVisionModel.h" @interface VisionCameraPluginInatVisionPlugin : FrameProcessorPlugin + (VCPTaxonomy *) taxonomyWithTaxonomyFile:(NSString *)taxonomyPath; -+ (VNCoreMLModel *)visionModelWithModelFile:(NSString *)modelPath; + (VCPGeoModel *)geoModelWithModelFile:(NSString *)geoModelPath; ++ (VCPVisionModel *)visionModelWithModelFile:(NSString *)modelPath; @end @@ -51,58 +53,14 @@ + (VCPGeoModel *)geoModelWithModelFile:(NSString *)modelPath { return geoModel; } -+ (VNCoreMLModel *)visionModelWithModelFile:(NSString *)modelPath { - static VNCoreMLModel *visionModel = nil; - if (visionModel == nil) { - // Setup vision - //NSURL *modelUrl = [NSURL fileURLWithPath:modelPath]; - NSURL *modelUrl = [[NSBundle mainBundle] URLForResource:@"cvmodel" withExtension:@"mlmodelc"]; - if (!modelUrl) { - // TODO: handle this error - // [self.delegate classifierError:@"no file for optimized model"]; - NSLog(@"no file for optimized model"); - return nil; - } - - NSError *loadError = nil; - MLModel *model = [MLModel modelWithContentsOfURL:modelUrl - error:&loadError]; - if (loadError) { - NSString *errString = [NSString stringWithFormat:@"error loading model: %@", - loadError.localizedDescription]; - NSLog(@"vision model mlmodel load error: %@", errString); - // TODO: handle this error - // [self.delegate classifierError:errString]; - return nil; - } else { - NSLog(@"no error produced while loading vision model"); - } - - if (!model) { - // TODO: handle this error - // [self.delegate classifierError:@"unable to make model"]; - NSLog(@"unable to make vision mlmodel"); - return nil; - } - - NSError *modelError = nil; - visionModel = [VNCoreMLModel modelForMLModel:model - error:&modelError]; - if (modelError) { - NSString *errString = [NSString stringWithFormat:@"error making vision model: %@", - modelError.localizedDescription]; - // [self.delegate classifierError:errString]; - NSLog(@"vision model vncoreml load error %@", errString); - return nil; - } - if (!visionModel) { - // [self.delegate classifierError:@"unable to make vision model"]; - NSLog(@"unable to make vision model vncoreml"); - return nil; - } ++ (VCPVisionModel *)visionModelWithModelFile:(NSString *)modelPath { + static VCPVisionModel *cvModel = nil; + + if (cvModel == nil) { + cvModel = [[VCPVisionModel alloc] initWithModelPath:modelPath]; } - return visionModel; + return cvModel; } - (instancetype)initWithProxy:(VisionCameraProxyHolder*)proxy @@ -197,58 +155,31 @@ - (id)callback:(Frame*)frame withArguments:(NSDictionary*)arguments { NSString* taxonomyPath = arguments[@"taxonomyPath"]; CMSampleBufferRef buffer = frame.buffer; + CVImageBufferRef pixelBuffer = CMSampleBufferGetImageBuffer(buffer); UIImageOrientation orientation = frame.orientation; - CVImageBufferRef pixelBuffer = CMSampleBufferGetImageBuffer(buffer); - if (!pixelBuffer) { - NSLog(@"unable to get pixel buffer"); - return nil; + VCPVisionModel *cvModel = [VisionCameraPluginInatVisionPlugin visionModelWithModelFile:modelPath]; + MLMultiArray *visionScores = [cvModel visionPredictionsFor:pixelBuffer orientation:orientation]; + + MLMultiArray *results = nil; + + + if (geoModelPreds != nil) { + NSError *err = nil; + results = [self combineVisionScores:visionScores with:geoModelPreds error:&err]; + results = [self normalizeMultiArray:results error:&err]; + } else { + results = visionScores; } + // Setup taxonomy VCPTaxonomy *taxonomy = [VisionCameraPluginInatVisionPlugin taxonomyWithTaxonomyFile:taxonomyPath]; - - // Setup vision model - VNCoreMLModel *visionModel = [VisionCameraPluginInatVisionPlugin visionModelWithModelFile:modelPath]; - - // Setup top branches + NSMutableArray *topBranches = [NSMutableArray array]; - VNRequestCompletionHandler recognitionHandler = ^(VNRequest * _Nonnull request, NSError * _Nullable error) { - VNCoreMLFeatureValueObservation *firstResult = request.results.firstObject; - MLFeatureValue *firstFV = firstResult.featureValue; - MLMultiArray *visionScores = firstFV.multiArrayValue; - - MLMultiArray *mm = nil; - if (geoModelPreds != nil) { - NSError *err = nil; - mm = [self combineVisionScores:visionScores with:geoModelPreds error:&err]; - mm = [self normalizeMultiArray:mm error:&err]; - } else { - mm = visionScores; - } - - NSArray *bestBranch = [taxonomy inflateTopBranchFromClassification:mm]; - // add this to the end of the recent top branches array - [topBranches addObject:bestBranch]; - }; - - VNCoreMLRequest *objectRecognition = [[VNCoreMLRequest alloc] initWithModel:visionModel - completionHandler:recognitionHandler]; - objectRecognition.imageCropAndScaleOption = VNImageCropAndScaleOptionCenterCrop; - NSArray *requests = @[objectRecognition]; - - VNImageRequestHandler *handler = [[VNImageRequestHandler alloc] initWithCVPixelBuffer:pixelBuffer - orientation:orientation - options:@{}]; - NSError *requestError = nil; - [handler performRequests:requests - error:&requestError]; - if (requestError) { - NSString *errString = [NSString stringWithFormat:@"got a request error: %@", - requestError.localizedDescription]; - NSLog(@"%@", errString); - return nil; - } + NSArray *bestBranch = [taxonomy inflateTopBranchFromClassification:results]; + // add this to the end of the recent top branches array + [topBranches addObject:bestBranch]; // convert the VCPPredictions in the bestRecentBranch into dicts NSMutableArray *bestBranchAsDict = [NSMutableArray array]; From e872ec5cb3e33f1a19ee83566f80fbf60fd9dd8b Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Sat, 19 Oct 2024 11:52:53 -0700 Subject: [PATCH 03/16] clean up errors --- .../VisionCameraPluginInatVision.m | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m index bcf9371..9b1ef5c 100644 --- a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m +++ b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m @@ -72,11 +72,12 @@ - (instancetype)initWithProxy:(VisionCameraProxyHolder*)proxy - (MLMultiArray *)combineVisionScores:(MLMultiArray *)visionScores with:(MLMultiArray *)geoScores error:(NSError **)error { // Ensure both arrays have the same shape if (![visionScores.shape isEqualToArray:geoScores.shape]) { - if (error) { - *error = [NSError errorWithDomain:@"MLMultiArrayErrorDomain" - code:1 - userInfo:@{NSLocalizedDescriptionKey: @"Arrays must have the same shape"}]; - } + NSDictionary *userInfo = @{ + NSLocalizedDescriptionKey: @"Arrays must have the same shape", + }; + *error = [NSError errorWithDomain:@"MLMultiArrayErrorDomain" + code:1 + userInfo:userInfo]; return nil; } @@ -85,6 +86,12 @@ - (MLMultiArray *)combineVisionScores:(MLMultiArray *)visionScores with:(MLMulti dataType:MLMultiArrayDataTypeDouble error:error]; if (!combinedArray) { + NSDictionary *userInfo = @{ + NSLocalizedDescriptionKey: @"Failed to make combined array", + }; + *error = [NSError errorWithDomain:@"MLMultiArrayErrorDomain" + code:2 + userInfo:userInfo]; return nil; } @@ -113,11 +120,12 @@ - (MLMultiArray *)normalizeMultiArray:(MLMultiArray *)mlArray error:(NSError **) if (sum != 0) { vDSP_vsdivD(mlData, 1, &sum, mlData, 1, count); } else { - if (error) { - *error = [NSError errorWithDomain:@"MLMultiArrayErrorDomain" - code:2 - userInfo:@{NSLocalizedDescriptionKey: @"Sum of elements is zero, normalization not possible."}]; - } + NSDictionary *userInfo = @{ + NSLocalizedDescriptionKey: @"Sum of elements is zero, normalization not possible." + }; + *error = [NSError errorWithDomain:@"MLMultiArrayErrorDomain" + code:3 + userInfo:userInfo]; return nil; } From 5702e470a7f52fbcf9946ae008a5cdfed49193b7 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Sat, 19 Oct 2024 11:53:43 -0700 Subject: [PATCH 04/16] make this combine method nullable so we can put something in error and return nil --- ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m index 9b1ef5c..daf1b2b 100644 --- a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m +++ b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m @@ -69,7 +69,7 @@ - (instancetype)initWithProxy:(VisionCameraProxyHolder*)proxy return self; } -- (MLMultiArray *)combineVisionScores:(MLMultiArray *)visionScores with:(MLMultiArray *)geoScores error:(NSError **)error { +- (MLMultiArray * _Nullable)combineVisionScores:(MLMultiArray *)visionScores with:(MLMultiArray *)geoScores error:(NSError **)error { // Ensure both arrays have the same shape if (![visionScores.shape isEqualToArray:geoScores.shape]) { NSDictionary *userInfo = @{ From f6f4888b96cfe1d4cc14d60d31190de570c09373 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Sat, 19 Oct 2024 11:55:09 -0700 Subject: [PATCH 05/16] convert all DSP ops to float --- .../VisionCameraPluginInatVision.m | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m index daf1b2b..42747cc 100644 --- a/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m +++ b/ios/VisionCameraPluginInatVision/VisionCameraPluginInatVision.m @@ -83,7 +83,7 @@ - (MLMultiArray * _Nullable)combineVisionScores:(MLMultiArray *)visionScores wit // Create a result MLMultiArray with the same shape as the input arrays MLMultiArray *combinedArray = [[MLMultiArray alloc] initWithShape:visionScores.shape - dataType:MLMultiArrayDataTypeDouble + dataType:MLMultiArrayDataTypeFloat32 error:error]; if (!combinedArray) { NSDictionary *userInfo = @{ @@ -96,29 +96,28 @@ - (MLMultiArray * _Nullable)combineVisionScores:(MLMultiArray *)visionScores wit } // Get the data pointers - double *visionData = (double *)visionScores.dataPointer; - double *geoData = (double *)geoScores.dataPointer; - double *combinedData = (double *)combinedArray.dataPointer; - + float *visionData = (float *)visionScores.dataPointer; + float *geoData = (float *)geoScores.dataPointer; + float *combinedData = (float *)combinedArray.dataPointer; + // Get the number of elements NSInteger count = visionScores.count; - + // Perform element-wise multiplication using vDSP_vmul - vDSP_vmulD(visionData, 1, geoData, 1, combinedData, 1, count); - + vDSP_vmul(visionData, 1, geoData, 1, combinedData, 1, count); + return combinedArray; } - (MLMultiArray *)normalizeMultiArray:(MLMultiArray *)mlArray error:(NSError **)error { NSInteger count = mlArray.count; - double *mlData = (double *)mlArray.dataPointer; + float *mlData = (float *)mlArray.dataPointer; - double sum = 0.0; - vDSP_sveD(mlData, 1, &sum, count); + float sum = 0.0; + vDSP_sve(mlData, 1, &sum, count); - // Normalize by dividing each element by the sum if (sum != 0) { - vDSP_vsdivD(mlData, 1, &sum, mlData, 1, count); + vDSP_vsdiv(mlData, 1, &sum, mlData, 1, count); } else { NSDictionary *userInfo = @{ NSLocalizedDescriptionKey: @"Sum of elements is zero, normalization not possible." From 9d85ca1d5e53d6e73e874abf952ff422c41e5e01 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Sat, 19 Oct 2024 11:55:54 -0700 Subject: [PATCH 06/16] apply a cutoff before taxonomy cutoff from 550ms per frame to 120ms per frame on my iPhone 13 pro --- ios/Classifier/VCPTaxonomy.m | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ios/Classifier/VCPTaxonomy.m b/ios/Classifier/VCPTaxonomy.m index 2cbd9d0..c2e20e8 100644 --- a/ios/Classifier/VCPTaxonomy.m +++ b/ios/Classifier/VCPTaxonomy.m @@ -16,6 +16,7 @@ @interface VCPTaxonomy () // this is a convenience array for testing @property NSArray *leaves; @property VCPNode *life; +@property float taxonomyRollupCutoff; @end @implementation VCPTaxonomy @@ -83,6 +84,8 @@ - (instancetype)initWithTaxonomyFile:(NSString *)taxaFile { [self.life addChild:node]; } } + + self.taxonomyRollupCutoff = 0.01; } return self; @@ -133,32 +136,33 @@ - (VCPPrediction *)inflateTopPredictionFromClassification:(MLMultiArray *)classi // following // https://github.com/inaturalist/inatVisionAPI/blob/multiclass/inferrers/multi_class_inferrer.py#L136 - (NSDictionary *)aggregateScores:(MLMultiArray *)classification currentNode:(VCPNode *)node { - if (node.children.count > 0) { - // we'll populate this and return it - NSMutableDictionary *allScores = [NSMutableDictionary dictionary]; - - for (VCPNode *child in node.children) { - NSDictionary *childScores = [self aggregateScores:classification currentNode:child]; - [allScores addEntriesFromDictionary:childScores]; - } + NSMutableDictionary *allScores = [NSMutableDictionary dictionary]; + if (node.children.count > 0) { float thisScore = 0.0f; for (VCPNode *child in node.children) { - thisScore += [allScores[child.taxonId] floatValue]; + NSDictionary *childScores = [self aggregateScores:classification currentNode:child]; + NSNumber *childScore = childScores[child.taxonId]; + + if ([childScore floatValue] > self.taxonomyRollupCutoff) { + [allScores addEntriesFromDictionary:childScores]; + thisScore += [childScore floatValue]; + } } - allScores[node.taxonId] = @(thisScore); - return [NSDictionary dictionaryWithDictionary:allScores]; } else { // base case, no children NSAssert(node.leafId, @"node with taxonId %@ has no children but also has no leafId", node.taxonId); NSNumber *leafScore = [classification objectAtIndexedSubscript:node.leafId.integerValue]; NSAssert(leafScore, @"node with leafId %@ has no score", node.leafId); - return @{ - node.taxonId: leafScore - }; + + if ([leafScore floatValue] > self.taxonomyRollupCutoff) { + allScores[node.taxonId] = leafScore; + } } + + return [allScores copy]; } - (NSDictionary *)aggregateScores:(MLMultiArray *)classification { From 032bb0f228fdd4115fada54b73be4dfe9c73c712 Mon Sep 17 00:00:00 2001 From: Alex Shepard Date: Sat, 19 Oct 2024 12:04:04 -0700 Subject: [PATCH 07/16] refactor geo model into its own class add geo model result caching (saves about 5ms per frame) --- example/src/App.tsx | 34 ++++++ ios/Classifier/VCPGeoModel.h | 24 ++++ ios/Classifier/VCPGeoModel.m | 111 ++++++++++++++++++ .../project.pbxproj | 6 + .../VisionCameraPluginInatVision.m | 103 ++-------------- src/index.tsx | 6 + 6 files changed, 189 insertions(+), 95 deletions(-) create mode 100644 ios/Classifier/VCPGeoModel.h create mode 100644 ios/Classifier/VCPGeoModel.m diff --git a/example/src/App.tsx b/example/src/App.tsx index e516b74..586275d 100644 --- a/example/src/App.tsx +++ b/example/src/App.tsx @@ -29,12 +29,17 @@ const modelFilenameAndroid = 'small_inception_tf1.tflite'; const taxonomyFilenameAndroid = 'small_export_tax.csv'; const modelFilenameIOS = 'small_inception_tf1.mlmodelc'; const taxonomyFilenameIOS = 'small_export_tax.json'; +const geoModelFilenameIOS = 'small_geomodel.mlmodelc'; const modelVersion = '1.0'; const modelPath = Platform.OS === 'ios' ? `${RNFS.DocumentDirectoryPath}/${modelFilenameIOS}` : `${RNFS.DocumentDirectoryPath}/${modelFilenameAndroid}`; +const geoModelPath = + Platform.OS === 'ios' + ? `${RNFS.DocumentDirectoryPath}/${geoModelFilenameIOS}` + : `${RNFS.DocumentDirectoryPath}/${modelFilenameAndroid}`; const taxonomyPath = Platform.OS === 'ios' ? `${RNFS.DocumentDirectoryPath}/${taxonomyFilenameIOS}` @@ -49,6 +54,7 @@ export default function App(): React.JSX.Element { undefined ); const [negativeFilter, setNegativeFilter] = useState(false); + const [useGeoModel, setUseGeoModel] = useState(false); enum VIEW_STATUS { NONE, @@ -66,6 +72,10 @@ export default function App(): React.JSX.Element { setNegativeFilter(!negativeFilter); }; + const toggleUseGeoModel = () => { + setUseGeoModel(!useGeoModel); + }; + const changeFilterByTaxonId = () => { if (!filterByTaxonId) { setFilterByTaxonId('47126'); @@ -110,6 +120,16 @@ export default function App(): React.JSX.Element { .catch((error) => { console.log(`error moving model file`, error); }); + RNFS.copyFile( + `${RNFS.MainBundlePath}/${geoModelFilenameIOS}`, + `${RNFS.DocumentDirectoryPath}/${geoModelFilenameIOS}` + ) + .then((result) => { + console.log(`moved geo model file from`, result); + }) + .catch((error) => { + console.log(`error moving geo model file`, error); + }); RNFS.copyFile( `${RNFS.MainBundlePath}/${taxonomyFilenameIOS}`, `${RNFS.DocumentDirectoryPath}/${taxonomyFilenameIOS}` @@ -147,6 +167,11 @@ export default function App(): React.JSX.Element { 'worklet'; try { const timeBefore = new Date().getTime(); + + const latitude = 37.28889; + const longitude = -121.94415; + const elevation = 15.0; + const cvResult: InatVision.Result = InatVision.inatVision(frame, { version: modelVersion, modelPath, @@ -156,6 +181,11 @@ export default function App(): React.JSX.Element { negativeFilter, numStoredResults: 4, cropRatio: 0.9, + latitude, + longitude, + elevation, + geoModelPath, + useGeoModel, }); const timeAfter = new Date().getTime(); console.log('time taken ms: ', timeAfter - timeBefore); @@ -316,6 +346,10 @@ export default function App(): React.JSX.Element { onPress={() => setViewStatus(VIEW_STATUS.NONE)} title="Close" /> +