之前一直用腾讯云的 OCR API,它的高精度 OCR 还是挺好用的,但续费的价格实在感人:1000 次调用就要 200 元,见下图。
考虑到之前使用 Kimi 的图像识别功能效果还不错,就试着用 AI 大模型来做 OCR。一般来说,在 AI 的加持下,识别精度比普通的 OCR API 要高。但官网只给了 Python 和 Node.js 的示例代码,为了适配自己的 WP 采集发帖程序,现将其改写为 C# 代码:
namespace wordpress采集项目
{
/// <summary>
/// OCR helper that sends an image to the Kimi (Moonshot) vision chat-completions endpoint
/// and returns the text the model reads from it.
/// </summary>
public class KimiOcr
{
    // Environment variable that is consulted first for the API key.
    private const string ApiKeyEnvironmentVariable = "MOONSHOT_API_KEY";

    // API key: taken from the environment when set; the hard-coded placeholder is only a
    // fallback for local experiments (do not ship a real key in source).
    private static readonly string ApiKey = ResolveApiKey("sk-XXXXXX"); // your API key

    private static readonly string BaseUrl = "https://api.moonshot.cn/v1";

    // Unused members of a content part (e.g. "text" on an image part) are omitted from the
    // JSON instead of being sent as explicit nulls.
    private static readonly JsonSerializerSettings PayloadSettings = new JsonSerializerSettings
    {
        NullValueHandling = NullValueHandling.Ignore
    };

    // One shared client for the whole process: creating and disposing an HttpClient per
    // request can exhaust sockets under load. Declared after ApiKey because static field
    // initializers run in textual order and CreateClient reads ApiKey.
    private static readonly HttpClient Client = CreateClient();

    /// <summary>
    /// Runs OCR on an image with the Kimi vision model by sending the image together with a
    /// prompt that asks the model to read the text in it.
    /// </summary>
    /// <param name="imagePath">Path of the image file to recognize.</param>
    /// <returns>
    /// The recognized text, or <see cref="string.Empty"/> when the file cannot be read, the
    /// request fails, or the response contains no answer. Failures are written to the console
    /// and never thrown to the caller.
    /// </returns>
    public static async Task<string> RecognizeImageText(string imagePath)
    {
        try
        {
            // Embed the image in the request as a base64 data URL.
            byte[] imageBytes = File.ReadAllBytes(imagePath);
            string imageBase64 = Convert.ToBase64String(imageBytes);
            string imageDataUrl = $"data:image/{GetImageSubtype(imagePath)};base64,{imageBase64}";

            var requestPayload = new ChatRequest
            {
                model = "moonshot-v1-8k-vision-preview", // adjust the model name as needed
                messages = new ChatMessage[]
                {
                    new ChatMessage { role = "system", content = "你是 Kimi。" },
                    new ChatMessage
                    {
                        role = "user",
                        // The user message has two parts: the image and the text instruction.
                        content = new List<ContentPart>
                        {
                            new ContentPart
                            {
                                type = "image_url",
                                image_url = new ImageUrlPart { url = imageDataUrl }
                            },
                            new ContentPart
                            {
                                type = "text",
                                text = "请识别图片中的文字。"
                            }
                        }
                    }
                }
            };
            string jsonPayload = JsonConvert.SerializeObject(requestPayload, PayloadSettings);

            using (var contentData = new StringContent(jsonPayload, System.Text.Encoding.UTF8, "application/json"))
            using (HttpResponseMessage response = await Client.PostAsync($"{BaseUrl}/chat/completions", contentData).ConfigureAwait(false))
            {
                string jsonResponse = response.Content == null
                    ? string.Empty
                    : await response.Content.ReadAsStringAsync().ConfigureAwait(false);

                if (!response.IsSuccessStatusCode)
                {
                    // Log the response body too: it carries the API's own error description.
                    Console.WriteLine($"OCR 识别失败:HTTP {(int)response.StatusCode} {jsonResponse}");
                    return string.Empty;
                }

                var chatResponse = JsonConvert.DeserializeObject<ChatResponse>(jsonResponse);
                Choice[] choices = chatResponse?.choices;
                if (choices == null || choices.Length == 0 || choices[0]?.message == null)
                {
                    Console.WriteLine("OCR 识别失败:响应中没有识别结果");
                    return string.Empty;
                }

                return choices[0].message.content ?? string.Empty;
            }
        }
        catch (Exception ex)
        {
            // Best-effort API: report the problem and hand back an empty result.
            Console.WriteLine($"OCR 识别失败:{ex.Message}");
            return string.Empty;
        }
    }

    // Returns the configured environment variable's value, or the fallback when it is unset or blank.
    private static string ResolveApiKey(string fallback)
    {
        string fromEnvironment = Environment.GetEnvironmentVariable(ApiKeyEnvironmentVariable);
        return string.IsNullOrWhiteSpace(fromEnvironment) ? fallback : fromEnvironment.Trim();
    }

    // Builds the shared HttpClient with the bearer token and JSON Accept header preset.
    private static HttpClient CreateClient()
    {
        var client = new HttpClient();
        client.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("Bearer", ApiKey);
        client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));
        return client;
    }

    // Maps the file extension to the subtype used in the "image/..." media type of the data URL.
    private static string GetImageSubtype(string imagePath)
    {
        string extension = (Path.GetExtension(imagePath) ?? string.Empty)
            .TrimStart('.')
            .ToLowerInvariant(); // invariant: file extensions are not linguistic text

        switch (extension)
        {
            case "jpg":
            case "jpe":
            case "jfif":
                return "jpeg"; // "image/jpg" is not a registered media type
            case "tif":
                return "tiff";
            case "":
                // No extension to go by; "png" is a best-effort default so the data URL stays well-formed.
                return "png";
            default:
                return extension;
        }
    }
}
/// <summary>
/// Request body for the chat-completions call: the model to use and the conversation to send.
/// Property names are lower-case because they are serialized to JSON as-is.
/// </summary>
public class ChatRequest
{
    /// <summary>Name of the model that should handle the request.</summary>
    public string model { get; set; }

    /// <summary>Messages that make up the conversation, in order.</summary>
    public ChatMessage[] messages { get; set; }
}
/// <summary>
/// One message of the conversation sent in a <c>ChatRequest</c>.
/// </summary>
public class ChatMessage
{
    /// <summary>Role of the message author, e.g. "system" or "user".</summary>
    public string role { get; set; }

    /// <summary>
    /// Message body. Typed as <see cref="object"/> because it is either a plain string or a
    /// list of content parts (image + text).
    /// </summary>
    public object content { get; set; }
}
/// <summary>
/// One part of a multi-part user message: either an image reference or a piece of text,
/// selected by <see cref="type"/>.
/// </summary>
public class ContentPart
{
    /// <summary>Kind of the part; the request code uses "image_url" and "text".</summary>
    public string type { get; set; }

    /// <summary>Image reference; only set on "image_url" parts and omitted from the JSON when null.</summary>
    [JsonProperty(NullValueHandling = NullValueHandling.Ignore)]
    public ImageUrlPart image_url { get; set; }

    /// <summary>Text content; only set on "text" parts and omitted from the JSON when null.</summary>
    [JsonProperty(NullValueHandling = NullValueHandling.Ignore)]
    public string text { get; set; }
}
/// <summary>
/// Wrapper object for the image location inside an "image_url" content part.
/// </summary>
public class ImageUrlPart
{
    /// <summary>URL of the image; the OCR request passes a base64 data URL here.</summary>
    public string url { get; set; }
}
/// <summary>
/// Subset of the chat-completions response that the OCR code reads.
/// </summary>
public class ChatResponse
{
    /// <summary>Candidate answers returned by the model; the OCR code uses the first one.</summary>
    public Choice[] choices { get; set; }
}
/// <summary>
/// One candidate answer inside a <c>ChatResponse</c>.
/// </summary>
public class Choice
{
    /// <summary>The message produced by the model for this candidate.</summary>
    public ChatMessageResponse message { get; set; }
}
/// <summary>
/// Message returned by the model; only its text content is deserialized.
/// </summary>
public class ChatMessageResponse
{
    /// <summary>Text of the model's reply (for OCR, the recognized text).</summary>
    public string content { get; set; }
}
}
使用方法:
// Previous Tencent Cloud call, kept for reference (second argument 1 = high-precision mode):
// ocrText = await PerformOcr(outputImagePath, checkBox4.Checked ? 1 : 0);
string ocrText = await KimiOcr.RecognizeImageText(outputImagePath);
根据 Kimi 官网的说法:每张图片消耗的 Tokens 为固定值 1024(不区分图片尺寸及图片质量),Vision 模型在计费方式上与 moonshot-v1 系列模型保持一致。那么费用可以这样估算:
按模型推理的总 Tokens 计费,1M Tokens 为 12 元,则每张图片大约消耗 1024 / 1,000,000 × 12 ≈ 0.012 元(提示词和识别结果的 Tokens 另计,但数量通常很少)。相比腾讯云的 0.2 元/张(200 元 / 1000 次),价格只有腾讯云的约 6%,而且识别精度更高,可以说是又好又便宜,可以放开来用。
评论0