// Mirror of https://github.com/alibaba/MNN.git (Objective-C, 217 lines, 6.9 KiB)
//
// LLMInferenceEngineWrapper.h
// mnn-llm
//
// Created by wangzhaode on 2023/12/14.
//
|
|
|
|
#ifndef LLMInferenceEngineWrapper_h
|
|
#define LLMInferenceEngineWrapper_h
|
|
|
|
#import <Foundation/Foundation.h>
|
|
|
|
NS_ASSUME_NONNULL_BEGIN
|
|
|
|
/** Block that receives a single flag telling whether the operation succeeded. */
typedef void (^CompletionHandler)(BOOL success);

/** Block that receives one piece of output text; the string is declared non-null. */
typedef void (^OutputHandler)(NSString * _Nonnull output);
|
|
|
|
// MARK: - Benchmark Related Types
|
|
|
|
/**
 * Progress type enumeration for structured benchmark reporting.
 *
 * Each case carries an explicit raw value (0 through 6) so the numeric
 * meaning of a case does not depend on its position in this list.
 */
typedef NS_ENUM(NSInteger, BenchmarkProgressType) {
    BenchmarkProgressTypeUnknown           = 0,
    BenchmarkProgressTypeInitializing      = 1,
    BenchmarkProgressTypeWarmingUp         = 2,
    BenchmarkProgressTypeRunningTest       = 3,
    BenchmarkProgressTypeProcessingResults = 4,
    BenchmarkProgressTypeCompleted         = 5,
    BenchmarkProgressTypeStopping          = 6,
};
|
|
|
|
/**
 * Structured progress information for benchmark.
 *
 * A plain value holder: every member is a read-write property and this
 * header declares no methods on the class.
 */
@interface BenchmarkProgressInfo : NSObject

/** Overall progress, 0-100. */
@property (nonatomic, assign) NSInteger progress;

/**
 * Status description.
 *
 * Declared `copy` (not `strong`) so that an NSMutableString handed in by a
 * caller cannot be mutated afterwards and silently change this object's state.
 */
@property (nonatomic, copy) NSString *statusMessage;

/** Kind of progress being reported; see BenchmarkProgressType. */
@property (nonatomic, assign) BenchmarkProgressType progressType;

/** Index of the current iteration. NOTE(review): 0- or 1-based is not stated here - confirm. */
@property (nonatomic, assign) NSInteger currentIteration;

/** Total number of iterations. */
@property (nonatomic, assign) NSInteger totalIterations;

/** Prompt token count. NOTE(review): presumably mirrors the benchmark's nPrompt argument - confirm. */
@property (nonatomic, assign) NSInteger nPrompt;

/** Generated token count. NOTE(review): presumably mirrors the benchmark's nGenerate argument - confirm. */
@property (nonatomic, assign) NSInteger nGenerate;

/** Run time, in seconds (per the property name). */
@property (nonatomic, assign) float runTimeSeconds;

/** Prefill time, in seconds (per the property name). */
@property (nonatomic, assign) float prefillTimeSeconds;

/** Decode time, in seconds (per the property name). */
@property (nonatomic, assign) float decodeTimeSeconds;

/** Prefill speed. NOTE(review): unit is not stated in this header (presumably tokens/second) - confirm. */
@property (nonatomic, assign) float prefillSpeed;

/** Decode speed. NOTE(review): unit is not stated in this header (presumably tokens/second) - confirm. */
@property (nonatomic, assign) float decodeSpeed;

@end
|
|
|
|
/**
 * Benchmark result structure.
 *
 * A plain value holder: every member is a read-write property and this
 * header declares no methods on the class. String and array members are
 * declared `copy` so that mutable instances supplied by a caller cannot be
 * modified afterwards and silently change a stored result.
 */
@interface BenchmarkResult : NSObject

/** Whether the benchmark succeeded. */
@property (nonatomic, assign) BOOL success;

/** Error description; may be nil. */
@property (nonatomic, copy, nullable) NSString *errorMessage;

/** Prefill timings. NOTE(review): the "Us" suffix presumably means microseconds - confirm. */
@property (nonatomic, copy) NSArray<NSNumber *> *prefillTimesUs;

/** Decode timings. NOTE(review): the "Us" suffix presumably means microseconds - confirm. */
@property (nonatomic, copy) NSArray<NSNumber *> *decodeTimesUs;

/** Sample timings. NOTE(review): the "Us" suffix presumably means microseconds - confirm. */
@property (nonatomic, copy) NSArray<NSNumber *> *sampleTimesUs;

/** Number of prompt tokens. */
@property (nonatomic, assign) NSInteger promptTokens;

/** Number of generated tokens. */
@property (nonatomic, assign) NSInteger generateTokens;

/** Number of repetitions. */
@property (nonatomic, assign) NSInteger repeatCount;

/** Whether the KV cache was enabled. */
@property (nonatomic, assign) BOOL kvCacheEnabled;

@end
|
|
|
|
// Benchmark callback blocks

/** Receives a BenchmarkProgressInfo object describing benchmark progress. */
typedef void (^BenchmarkProgressCallback)(BenchmarkProgressInfo *progressInfo);

/** Receives an error description string. */
typedef void (^BenchmarkErrorCallback)(NSString *error);

/** Receives a detailed-statistics string for a completed iteration. */
typedef void (^BenchmarkIterationCompleteCallback)(NSString *detailedStats);

/** Receives the BenchmarkResult object for a finished benchmark. */
typedef void (^BenchmarkCompleteCallback)(BenchmarkResult *result);
|
|
|
|
/**
 * LLMInferenceEngineWrapper - high-level Objective-C wrapper around the MNN LLM
 * inference engine.
 *
 * Described by its author as a convenient way to integrate MNN's Large Language
 * Model inference into iOS applications, with enhanced error handling,
 * performance optimization, and thread safety.
 */
@interface LLMInferenceEngineWrapper : NSObject

/**
 * Initialize the LLM inference engine from a model path.
 *
 * @param modelPath  File system path to the model directory.
 * @param completion Handler called with the success/failure status.
 * @return An initialized LLMInferenceEngineWrapper instance.
 */
- (instancetype)initWithModelPath:(NSString *)modelPath completion:(CompletionHandler)completion;

/**
 * Process user input and produce a streaming LLM response.
 *
 * @param input  The user's input text.
 * @param output Block that receives the streaming output chunks.
 */
- (void)processInput:(NSString *)input withOutput:(OutputHandler)output;

/**
 * Process user input and produce a streaming LLM response, optionally followed
 * by performance output.
 *
 * @param input           The user's input text.
 * @param output          Block that receives the streaming output chunks.
 * @param showPerformance Whether to output performance statistics once the
 *                        response has completed.
 */
- (void)processInput:(NSString *)input withOutput:(OutputHandler)output showPerformance:(BOOL)showPerformance;

/**
 * Add chat prompts to the conversation history.
 *
 * @param array Array of dictionaries, each holding a chat message.
 */
- (void)addPromptsFromArray:(NSArray<NSDictionary *> *)array;

/**
 * Configure the LLM engine from a JSON string.
 *
 * @param jsonStr JSON string containing the configuration parameters.
 */
- (void)setConfigWithJSONString:(NSString *)jsonStr;

/**
 * Turn the engine's thinking mode on or off.
 *
 * @param enabled Whether thinking mode should be enabled.
 */
- (void)setThinkingModeEnabled:(BOOL)enabled;

/**
 * Report whether the model is ready for inference.
 *
 * @return YES if the model is loaded and ready.
 */
- (BOOL)isModelReady;

/**
 * Report the current processing status.
 *
 * @return YES if an inference request is currently being processed.
 */
- (BOOL)isProcessing;

/**
 * Cancel the ongoing inference (if supported).
 */
- (void)cancelInference;

/**
 * Report the size of the chat history.
 *
 * @return Number of messages in the chat history.
 */
- (NSUInteger)getChatHistoryCount;

/**
 * Clear the chat history.
 */
- (void)clearChatHistory;

// MARK: - Benchmark Methods

/**
 * Run the official benchmark, following the llm_bench.cpp approach.
 *
 * @param backend                   Backend type (0 for CPU).
 * @param threads                   Number of threads.
 * @param useMmap                   Whether to use memory mapping.
 * @param power                     Power setting.
 * @param precision                 Precision setting (2 for low precision).
 * @param memory                    Memory setting (2 for low memory).
 * @param dynamicOption             Dynamic optimization option.
 * @param nPrompt                   Number of prompt tokens.
 * @param nGenerate                 Number of tokens to generate.
 * @param nRepeat                   Number of repetitions.
 * @param kvCache                   Whether to use the KV cache.
 * @param progressCallback          Progress update callback; may be nil.
 * @param errorCallback             Error callback; may be nil.
 * @param iterationCompleteCallback Iteration completion callback; may be nil.
 * @param completeCallback          Final completion callback; may be nil.
 */
- (void)runOfficialBenchmarkWithBackend:(NSInteger)backend
                                threads:(NSInteger)threads
                                useMmap:(BOOL)useMmap
                                  power:(NSInteger)power
                              precision:(NSInteger)precision
                                 memory:(NSInteger)memory
                          dynamicOption:(NSInteger)dynamicOption
                                nPrompt:(NSInteger)nPrompt
                              nGenerate:(NSInteger)nGenerate
                                nRepeat:(NSInteger)nRepeat
                                kvCache:(BOOL)kvCache
                       progressCallback:(BenchmarkProgressCallback _Nullable)progressCallback
                          errorCallback:(BenchmarkErrorCallback _Nullable)errorCallback
              iterationCompleteCallback:(BenchmarkIterationCompleteCallback _Nullable)iterationCompleteCallback
                       completeCallback:(BenchmarkCompleteCallback _Nullable)completeCallback;

/**
 * Stop the running benchmark.
 */
- (void)stopBenchmark;

/**
 * Report whether a benchmark is currently running.
 *
 * @return YES if a benchmark is running.
 */
- (BOOL)isBenchmarkRunning;

@end
|
|
|
|
NS_ASSUME_NONNULL_END
|
|
|
|
#endif /* LLMInferenceEngineWrapper_h */
|