
Building a Baidu AI Experience Platform with ASP.NET Core MVC 3.1: From Face/Gesture Recognition to Voice Control


Preface

It has been far too long since my last post. Not that I've been busy; my enthusiasm for learning dropped and I'm fairly lazy, so even when I built something new or picked something up, I couldn't be bothered to write it down. Recently I put together a Baidu AI web toy that does face recognition/registration, attractiveness scoring, and gesture recognition; the coolest part is that it can be controlled by voice commands. It looks roughly like the screenshots below. The face part (face registration and 1:N face search) and attractiveness scoring were covered in detail in a previous article, 百度人脸识别HTTP SDK实战:基于C# ASP.NET CORE MVC 3.1; attractiveness scoring just adds one parameter to that code (the Baidu docs spell it out), so it isn't repeated here. This article focuses on the code for gesture recognition and speech recognition.

(Screenshots: overview of the finished platform)

Gesture Recognition

The result looks like the screenshot below. 24 gestures are supported; the full list is at https://cloud.baidu.com/doc/BODY/s/Dk3cpyr8l

(Screenshot: gesture recognition demo)

Code Logic

The official documentation already ships a decent C# sample for this API; it only needs minor tweaks to work here.

Controller logic

public string GetAccessToken()
{
    // Exchange the API key/secret pair for an OAuth access token
    string authHost = "https://aip.baidubce.com/oauth/2.0/token";
    HttpClient client = new HttpClient();
    List<KeyValuePair<string, string>> paraList = new List<KeyValuePair<string, string>>();
    paraList.Add(new KeyValuePair<string, string>("grant_type", "client_credentials"));
    paraList.Add(new KeyValuePair<string, string>("client_id", _configuration["BaiduAiConfig:BaiDuGestureRecon:ApiKey_Gesture"]));
    paraList.Add(new KeyValuePair<string, string>("client_secret",
        _configuration["BaiduAiConfig:BaiDuGestureRecon:SecretKey_Gesture"]));

    HttpResponseMessage response = client.PostAsync(authHost, new FormUrlEncodedContent(paraList)).Result;
    string result = response.Content.ReadAsStringAsync().Result;
    var resultJson = JsonConvert.DeserializeObject<JObject>(result);
    AccessToken = resultJson["access_token"].ToString();
    return AccessToken;
}
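A side note: the blocking .Result calls above are fine in a toy project, but they can deadlock under some synchronization contexts, so async/await is the safer pattern in ASP.NET Core. A minimal sketch of the same method rewritten asynchronously (same endpoint and config keys; the method name is mine):

public async Task<string> GetAccessTokenAsync()
{
    string authHost = "https://aip.baidubce.com/oauth/2.0/token";
    using var client = new HttpClient();
    var paraList = new List<KeyValuePair<string, string>>
    {
        new KeyValuePair<string, string>("grant_type", "client_credentials"),
        new KeyValuePair<string, string>("client_id", _configuration["BaiduAiConfig:BaiDuGestureRecon:ApiKey_Gesture"]),
        new KeyValuePair<string, string>("client_secret", _configuration["BaiduAiConfig:BaiDuGestureRecon:SecretKey_Gesture"])
    };
    // await frees the request thread while waiting on Baidu
    HttpResponseMessage response = await client.PostAsync(authHost, new FormUrlEncodedContent(paraList));
    string result = await response.Content.ReadAsStringAsync();
    var resultJson = JsonConvert.DeserializeObject<JObject>(result);
    AccessToken = resultJson["access_token"].ToString();
    return AccessToken;
}

The token Baidu returns is also long-lived (the docs quote roughly 30 days), so caching it instead of requesting a new one per call would be another easy improvement.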

public IActionResult GestureFromWeb(string imgData64FromAjax)
{
    GetAccessToken();
    string host = "https://aip.baidubce.com/rest/2.0/image-classify/v1/gesture?access_token=" + AccessToken;
    Encoding encoding = Encoding.Default;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(host);
    request.Method = "POST";
    request.KeepAlive = true;
    // Base64-encoded image; for a local file you could use
    // string base64 = GetFileBase64("[local image file]");
    string requestImgData64 = imgData64FromAjax;
    // Strip the "data:image/png;base64," prefix from the data URL
    requestImgData64 = requestImgData64.Substring(requestImgData64.IndexOf(",") + 1);
    String str = "image=" + HttpUtility.UrlEncode(requestImgData64);
    byte[] buffer = encoding.GetBytes(str);
    request.ContentLength = buffer.Length;
    request.GetRequestStream().Write(buffer, 0, buffer.Length);
    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
    StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default);
    string result = reader.ReadToEnd();
    var resultJson = JsonConvert.DeserializeObject<JObject>(result);
    if (int.Parse(resultJson["result_num"].ToString()) != 0)
    {
        string gestureToken = resultJson["result"][0]["classname"].ToString();
        GestureResultDict resultDict = new GestureResultDict();
        try
        {
            // Map Baidu's classname token to a display string via the dictionary below
            string resultStr = resultDict.resultDict.FirstOrDefault(x => x.Key == gestureToken).Value;
            if (!string.IsNullOrWhiteSpace(resultStr))
            {
                return Json(resultStr);
            }
            return Json("无法识别手势"); // "gesture not recognized"
        }
        catch
        {
            return Json("无法识别手势");
        }
    }
    return RedirectToAction("index", "home");
}
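For reference, the gesture endpoint answers with JSON along the lines of {"log_id": ..., "result_num": 1, "result": [{"classname": "Ok", "probability": 0.99, ...}]}, which is exactly what the JObject indexing above walks. If you prefer typed deserialization, here is a small sketch of matching model classes; the property set is inferred from the fields used above, so treat it as an assumption rather than the full schema:

// Hypothetical typed model for the gesture response; the attribute names
// follow the JSON keys used above (result_num, result[].classname).
public class GestureResult
{
    [JsonProperty("classname")]
    public string ClassName { get; set; }

    [JsonProperty("probability")]
    public double Probability { get; set; }
}

public class GestureResponse
{
    [JsonProperty("result_num")]
    public int ResultNum { get; set; }

    [JsonProperty("result")]
    public List<GestureResult> Result { get; set; }
}

// usage: var resp = JsonConvert.DeserializeObject<GestureResponse>(result);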

Gesture mapping dictionary


// Maps Baidu's classname tokens to display strings.
// Exposed as resultDict so GestureFromWeb above can read it.
public Dictionary<string, string> resultDict = new Dictionary<string, string>()
{
    {"Ok","Ok" },
    {"Six","数字6" },
    {"Rock","Rock" },
    {"Thumb_up","点赞" },
    {"One","数字1" },
    {"Five","数字5" },
    {"Fist","拳头" },
    {"Prayer","上天保佑" },
    {"Congratulation","恭喜恭喜" },
    {"Heart_single","笔芯" },
    {"Thumb_down","鄙视你" },
    {"ILY","黑凤梨" },
    {"Insult","竖中指" },
    {"Nine","数字9" },
    {"Eight","数字8" },
    {"Seven","数字7" },
    {"Four","数字4" },
    {"Two","数字2/Yeah" } // Baidu's token is "Two"; the original listing had a "Tow" typo
};
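Since this is a plain Dictionary<string, string>, the controller's FirstOrDefault lookup (and its try/catch) can be replaced by a direct TryGetValue, which is both faster and cannot throw. A sketch of the equivalent branch:

if (resultDict.resultDict.TryGetValue(gestureToken, out string resultStr)
    && !string.IsNullOrWhiteSpace(resultStr))
{
    return Json(resultStr);
}
return Json("无法识别手势"); // "gesture not recognized"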

 

The controller receives the image from the front end as a base64 data URL (hence the Substring call above that strips the data:image/png;base64, prefix). The front-end logic uses jQuery.

function reconGesture() {
    let video = document.getElementById("video");
    var canvas = $('#canvasGesture')[0];
    let ctx = canvas.getContext('2d');
    // grab the current video frame onto the canvas
    canvas.height = 465;
    canvas.width = 400;
    ctx.drawImage(video, 0, 0, 400, 400);
    ctx.scale(-1, 1); // note: affects later drawing only, not the frame already drawn
    var img = convertCanvasToImage(canvas);
    $.ajax({
        url: '/Gesture/GestureFromWeb',
        type: 'post',
        dataType: 'json',
        data: { "imgData64FromAjax": img.src },
        success: function (jsonStr) {
            var data = JSON.stringify(jsonStr);
            console.log(data);
            $("#gestureText").html("手势识别结果为:" + data);
        }
    })

    //let img = document.getElementById("canvas").toDataURL("image/png");
    //var triggerDownload = $("#downloadA").attr("href", img).attr("download", "micro-blog.png");
    //triggerDownload[0].click();
}


// extract an image from the canvas
function convertCanvasToImage(canvas) {
    // new Image object (a detached DOM element)
    var image = new Image();
    // canvas.toDataURL returns a base64-encoded data URL,
    // here with PNG as the format
    image.src = canvas.toDataURL("image/png");
    return image;
}

Voice Control

The speech part is a bit more troublesome. Recorder.js is used as the plugin that records and uploads the audio, and Baidu's speech recognition has the following requirements for audio samples:

Raw PCM recordings must use a 16k or 8k sample rate, 16-bit depth, and a single (mono) channel; supported formats are pcm (uncompressed), wav (uncompressed, pcm-encoded) and amr (compressed). (At 16 kHz and 16 bits, a 5-second mono clip is 16000 × 2 × 5 = 160,000 bytes of PCM plus a 44-byte WAV header.) Recorder.js was therefore modified slightly to match. The full configured js file:

Full Recorder.js source

(function (f) { if (typeof exports === "object" && typeof module !== "undefined") { module.exports = f() } else if (typeof define === "function" && define.amd) { define([], f) } else { var g; if (typeof window !== "undefined") { g = window } else if (typeof global !== "undefined") { g = global } else if (typeof self !== "undefined") { g = self } else { g = this } g.Recorder = f() } })(function () {
    var define, module, exports; return (function e(t, n, r) { function s(o, u) { if (!n[o]) { if (!t[o]) { var a = typeof require == "function" && require; if (!u && a) return a(o, !0); if (i) return i(o, !0); var f = new Error("Cannot find module '" + o + "'"); throw f.code = "MODULE_NOT_FOUND", f } var l = n[o] = { exports: {} }; t[o][0].call(l.exports, function (e) { var n = t[o][1][e]; return s(n ? n : e) }, l, l.exports, e, t, n, r) } return n[o].exports } var i = typeof require == "function" && require; for (var o = 0; o < r.length; o++)s(r[o]); return s })({
        1: [function (require, module, exports) {
            "use strict";

            module.exports = require("./recorder").Recorder;
        }, { "./recorder": 2 }], 2: [function (require, module, exports) {
            'use strict';

            var _createClass = (function () {
                function defineProperties(target, props) {
                    for (var i = 0; i < props.length; i++) {
                        var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor);
                    }
                } return function (Constructor, protoProps, staticProps) {
                    if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor;
                };
            })();

            Object.defineProperty(exports, "__esModule", {
                value: true
            });
            exports.Recorder = undefined;

            var _inlineWorker = require('inline-worker');

            var _inlineWorker2 = _interopRequireDefault(_inlineWorker);

            function _interopRequireDefault(obj) {
                return obj && obj.__esModule ? obj : { default: obj };
            }

            function _classCallCheck(instance, Constructor) {
                if (!(instance instanceof Constructor)) {
                    throw new TypeError("Cannot call a class as a function");
                }
            }

            var Recorder = exports.Recorder = (function () {
                function Recorder(source, cfg) {
                    var _this = this;

                    _classCallCheck(this, Recorder);

                    this.config = {
                        bufferLen: 4096,
                        numChannels: 2,
                        mimeType: 'audio_pcm/wav'
                    };
                    this.recording = false;

                    this.callbacks = {
                        getBuffer: [],
                        exportWAV: []
                    };

                    Object.assign(this.config, cfg);
                    this.context = source.context;

                    this.node = (this.context.createScriptProcessor || this.context.createJavaScriptNode).call(this.context, this.config.bufferLen, this.config.numChannels, this.config.numChannels);

                    this.node.onaudioprocess = function (e) {
                        if (!_this.recording) return;

                        var buffer = [];
                        for (var channel = 0; channel < _this.config.numChannels; channel++) {
                            buffer.push(e.inputBuffer.getChannelData(channel));
                        }
                        _this.worker.postMessage({
                            command: 'record',
                            buffer: buffer
                        });
                    };

                    source.connect(this.node);
                    this.node.connect(this.context.destination); //this should not be necessary

                    var self = {};
                    this.worker = new _inlineWorker2.default(function () {
                        var recLength = 0,
                            recBuffers = [],
                            sampleRate = undefined,
                            numChannels = undefined;

                        //  var sampleStep = this.context.sampleRate / sampleRate;
                        self.onmessage = function (e) {
                            switch (e.data.command) {
                                case 'init':
                                    init(e.data.config);
                                    break;
                                case 'record':
                                    record(e.data.buffer);
                                    break;
                                case 'exportWAV':
                                    exportWAV(e.data.type);
                                    break;
                                case 'getBuffer':
                                    getBuffer();
                                    break;
                                case 'clear':
                                    clear();
                                    break;
                            }
                        };

                        function init(config) {
                            sampleRate = config.sampleRate;
                            numChannels = config.numChannels;
                            initBuffers();
                        }

                        function record(inputBuffer) {
                            for (var channel = 0; channel < numChannels; channel++) {
                                recBuffers[channel].push(inputBuffer[channel]);
                            }
                            recLength += inputBuffer[0].length;
                        }

                        function exportWAV(type) {
                            var buffers = [];
                            for (var channel = 0; channel < numChannels; channel++) {
                                buffers.push(mergeBuffers(recBuffers[channel], recLength));
                            }
                            var interleaved = undefined;
                            if (numChannels === 2) {
                                interleaved = interleave(buffers[0], buffers[1]);
                            } else {
                                // mono: take the single channel
                                interleaved = extractSingleChannel(buffers[0]);
                            }
                            var dataview = encodeWAV(interleaved);
                            var audioBlob = new Blob([dataview], { type: type });

                            self.postMessage({ command: 'exportWAV', data: audioBlob });
                        }

                        function getBuffer() {
                            var buffers = [];
                            for (var channel = 0; channel < numChannels; channel++) {
                                buffers.push(mergeBuffers(recBuffers[channel], recLength));
                            }
                            self.postMessage({ command: 'getBuffer', data: buffers });
                        }

                        function clear() {
                            recLength = 0;
                            recBuffers = [];
                            initBuffers();
                        }

                        function initBuffers() {
                            for (var channel = 0; channel < numChannels; channel++) {
                                recBuffers[channel] = [];
                            }
                        }

                        function mergeBuffers(recBuffers, recLength) {
                            var result = new Float32Array(recLength);
                            var offset = 0;
                            for (var i = 0; i < recBuffers.length; i++) {
                                result.set(recBuffers[i], offset);
                                offset += recBuffers[i].length;
                            }
                            return result;
                        }

                        function interleave(inputL, inputR) {
                            var length = inputL.length + inputR.length;
                            var result = new Float32Array(length);

                            var index = 0,
                                inputIndex = 0;

                            while (index < length) {
                                result[index++] = inputL[inputIndex];
                                result[index++] = inputR[inputIndex];
                                inputIndex++;
                            }
                            return result;
                        }

                        function extractSingleChannel(input) {
                            // If the length were not scaled down proportionally here, the exported
                            // file would contain sampleStep times as much silent audio.
                            var length = Math.ceil(input.length / 1);
                            var result = new Float32Array(length);
                            var index = 0,
                                inputIndex = 0;
                            while (index < length) {
                                // Key step: take one input sample every sampleStep points and put it
                                // into result (sampleStep is effectively 1 here, because the
                                // AudioContext itself is created at 16 kHz).
                                result[index++] = input[inputIndex];
                                inputIndex += 1;
                            }
                            return result;
                        }

                        function floatTo16BitPCM(output, offset, input) {
                            for (var i = 0; i < input.length; i++, offset += 2) {
                                var s = Math.max(-1, Math.min(1, input[i]));
                                output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
                            }
                        }

                        function writeString(view, offset, string) {
                            for (var i = 0; i < string.length; i++) {
                                view.setUint8(offset + i, string.charCodeAt(i));
                            }
                        }

                        function encodeWAV(samples) {
                            var buffer = new ArrayBuffer(44 + samples.length * 2);
                            var view = new DataView(buffer);

                            /* RIFF identifier */
                            writeString(view, 0, 'RIFF');
                            /* RIFF chunk length */
                            view.setUint32(4, 36 + samples.length * 2, true);
                            /* RIFF type */
                            writeString(view, 8, 'WAVE');
                            /* format chunk identifier */
                            writeString(view, 12, 'fmt ');
                            /* format chunk length */
                            view.setUint32(16, 16, true);
                            /* sample format (raw) */
                            view.setUint16(20, 1, true);
                            /* channel count */
                            view.setUint16(22, numChannels, true);
                            /* sample rate */
                            view.setUint32(24, sampleRate, true);
                            /* byte rate (sample rate * block align) */
                            view.setUint32(28, sampleRate * numChannels * 2, true); // original hardcoded stereo (* 4); mono needs * 2
                            /* block align (channel count * bytes per sample) */
                            view.setUint16(32, numChannels * 2, true);
                            /* bits per sample */
                            view.setUint16(34, 16, true);
                            /* data chunk identifier */
                            writeString(view, 36, 'data');
                            /* data chunk length */
                            view.setUint32(40, samples.length * 2, true);

                            floatTo16BitPCM(view, 44, samples);

                            return view;
                        }
                    }, self);

                    this.worker.postMessage({
                        command: 'init',
                        config: {
                            sampleRate: this.context.sampleRate,
                            numChannels: this.config.numChannels
                        }
                    });

                    this.worker.onmessage = function (e) {
                        var cb = _this.callbacks[e.data.command].pop();
                        if (typeof cb == 'function') {
                            cb(e.data.data);
                        }
                    };
                }

                _createClass(Recorder, [{
                    key: 'record',
                    value: function record() {
                        this.recording = true;
                    }
                }, {
                    key: 'stop',
                    value: function stop() {
                        this.recording = false;
                    }
                }, {
                    key: 'clear',
                    value: function clear() {
                        this.worker.postMessage({ command: 'clear' });
                    }
                }, {
                    key: 'getBuffer',
                    value: function getBuffer(cb) {
                        cb = cb || this.config.callback;
                        if (!cb) throw new Error('Callback not set');

                        this.callbacks.getBuffer.push(cb);

                        this.worker.postMessage({ command: 'getBuffer' });
                    }
                }, {
                    key: 'exportWAV',
                    value: function exportWAV(cb, mimeType) {
                        mimeType = mimeType || this.config.mimeType;
                        cb = cb || this.config.callback;
                        if (!cb) throw new Error('Callback not set');

                        this.callbacks.exportWAV.push(cb);

                        this.worker.postMessage({
                            command: 'exportWAV',
                            type: mimeType
                        });
                    }
                }], [{
                    key: 'forceDownload',
                    value: function forceDownload(blob, filename) {
                        var url = (window.URL || window.webkitURL).createObjectURL(blob);
                        var link = window.document.createElement('a');
                        link.href = url;
                        link.download = filename || 'output.wav';
                        var click = document.createEvent("Event");
                        click.initEvent("click", true, true);
                        link.dispatchEvent(click);
                    }
                }]);

                return Recorder;
            })();

            exports.default = Recorder;
        }, { "inline-worker": 3 }], 3: [function (require, module, exports) {
            "use strict";

            module.exports = require("./inline-worker");
        }, { "./inline-worker": 4 }], 4: [function (require, module, exports) {
            (function (global) {
                "use strict";

                var _createClass = (function () { function defineProperties(target, props) { for (var key in props) { var prop = props[key]; prop.configurable = true; if (prop.value) prop.writable = true; } Object.defineProperties(target, props); } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; })();

                var _classCallCheck = function (instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } };

                var WORKER_ENABLED = !!(global === global.window && global.URL && global.Blob && global.Worker);

                var InlineWorker = (function () {
                    function InlineWorker(func, self) {
                        var _this = this;

                        _classCallCheck(this, InlineWorker);

                        if (WORKER_ENABLED) {
                            var functionBody = func.toString().trim().match(/^function\s*\w*\s*\([\w\s,]*\)\s*{([\w\W]*?)}$/)[1];
                            var url = global.URL.createObjectURL(new global.Blob([functionBody], { type: "text/javascript" }));

                            return new global.Worker(url);
                        }

                        this.self = self;
                        this.self.postMessage = function (data) {
                            setTimeout(function () {
                                _this.onmessage({ data: data });
                            }, 0);
                        };

                        setTimeout(function () {
                            func.call(self);
                        }, 0);
                    }

                    _createClass(InlineWorker, {
                        postMessage: {
                            value: function postMessage(data) {
                                var _this = this;

                                setTimeout(function () {
                                    _this.self.onmessage({ data: data });
                                }, 0);
                            }
                        }
                    });

                    return InlineWorker;
                })();

                module.exports = InlineWorker;
            }).call(this, typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {})
        }, {}]
    }, {}, [1])(1)
});

Copy the code above and include it in your page.

Front-end logic

Note the following line:

        var audio_context = new AudioContext({ sampleRate: 16000 }); // audio context object

Leave it exactly as written; it matches the Recorder.js logic above, and only then does the sampled audio meet Baidu's requirements. The code below can be copied and used as-is. To control face recognition and other actions by voice, only a little extra logic is needed: the front end checks whether certain keywords appear in the recognized text and triggers the corresponding method. The sample code triggers on "手势识别" (gesture recognition). A timer makes it feel a bit smarter: click to start recording, and after 5 seconds the speech-recognition request fires automatically.

<script type="text/javascript">
    var reco = null;
    // var audio_context = new AudioContext(); // audio context object
    navigator.getUserMedia = (navigator.getUserMedia ||
        navigator.webkitGetUserMedia ||
        navigator.mozGetUserMedia ||
        navigator.msGetUserMedia); // vendor-prefixed fallbacks for other browsers

    navigator.getUserMedia({ audio: true }, create_stream, function (err) {
        console.log(err)
    });

    function create_stream(user_media) {
        // sampleRate is hard-coded to 16000 here
        var audio_context = new AudioContext({ sampleRate: 16000 }); // audio context object
        var stream_input = audio_context.createMediaStreamSource(user_media);
        reco = new Recorder(stream_input, {
            numChannels: 1
        });

    }
    var clock = '';
    function start_reco() {
        reco.record();
        clock = setInterval(ai_reco, 5000)
        console.log("666")
    }

    function ai_reco() {
        reco.stop();
        clearInterval(clock);
        reco.exportWAV(function (wav_file) {
            console.log(wav_file);
            var formdata = new FormData(); // form payload {key:value}
            formdata.append("audio", wav_file); // like a form input type="file"

            $.ajax({
                url: "/Recorder/RecorderVoice",
                type: 'post',
                processData: false,
                contentType: false,
                data: formdata,
                dataType: 'json',
                success: function (jsonStr) {

                    var data = JSON.stringify(jsonStr);
                    if (data.search("手势识别") != -1) {
                        $("#btn_rcon").click();
                    }
                    $("#voiceText").html("语音识别结果:" + data);
                    //  document.getElementById("player").src = "/get_audio/" + data.filename;
                }
            })
        });
        reco.clear();
    }
</script>

Back-end logic

Nothing much to explain here. Just remember to enable the corresponding speech service in the Baidu console, fill in the matching secret key, app id and other parameters, and also enable the Mandarin API (dev_pid 1537 in the code below).
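For reference, the configuration keys read below (and by the gesture controller earlier) live in appsettings.json. A sketch of the expected shape; the section and key names are taken verbatim from the _configuration lookups in the code (including the "SecertKey_Language" spelling, which must match), and the values are placeholders:

{
  "BaiduAiConfig": {
    "BaiDuGestureRecon": {
      "ApiKey_Gesture": "<your gesture api key>",
      "SecretKey_Gesture": "<your gesture secret key>"
    },
    "BaiDuLanguage": {
      "AppId_Language": "<your speech app id>",
      "ApiKey_Language": "<your speech api key>",
      "SecertKey_Language": "<your speech secret key>"
    }
  }
}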

public IActionResult RecorderVoice([FromForm] IFormFile audio)
{
    string appId = _configuration["BaiduAiConfig:BaiDuLanguage:AppId_Language"];
    string apiKey = _configuration["BaiduAiConfig:BaiDuLanguage:ApiKey_Language"];
    string secretKey = _configuration["BaiduAiConfig:BaiDuLanguage:SecertKey_Language"];
    var client = new Baidu.Aip.Speech.Asr(appId, apiKey, secretKey);
    client.Timeout = 60000;  // adjust the timeout

    // Persist the uploaded recording to wwwroot/files under a short random name
    string filename = Path.Combine("wwwroot/files", Guid.NewGuid().ToString().Substring(0, 6) + ".wav");
    using (FileStream fs = System.IO.File.Create(filename))
    {
        audio.CopyTo(fs);
        fs.Flush();
    }

    FileStream filestream = new FileStream(filename, FileMode.Open);
    byte[] arr = new byte[filestream.Length];
    filestream.Read(arr, 0, (int)filestream.Length);
    filestream.Close();
    // Optional parameters
    var options = new Dictionary<string, object>
    {
        {"dev_pid", 1537}      // Mandarin
        // {"dev_pid", 1737 }  // English
    };
    client.Timeout = 120000; // for longer audio, a larger timeout (ms) is recommended
    var result = client.Recognize(arr, "wav", 16000, options);

    if (int.Parse(result["err_no"].ToString()) == 0 && result["err_msg"].ToString() == "success.")
    {
        return Json(result["result"][0].ToString());
    }
    return Json("Error");
}
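Baidu will reject audio whose actual sample rate doesn't match the declared 16000, so a cheap sanity check before calling Recognize is to read the sample rate straight out of the WAV header: the encodeWAV function above writes it at byte offset 24 as a little-endian uint32 in a canonical 44-byte header. A minimal sketch under that assumption (the helper name is mine):

using System.Buffers.Binary;

// Reads the sample-rate field of a canonical RIFF/WAVE header.
private static uint ReadWavSampleRate(byte[] wavBytes)
{
    if (wavBytes.Length < 44)
        throw new ArgumentException("shorter than a canonical 44-byte WAV header");
    return BinaryPrimitives.ReadUInt32LittleEndian(wavBytes.AsSpan(24, 4));
}

// usage, just before client.Recognize(arr, "wav", 16000, options):
// if (ReadWavSampleRate(arr) != 16000) return Json("unexpected sample rate");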


Conclusion

I'll leave it here. If you want the complete runnable code, follow me and send a private message... back to moving bricks...

Original post: https://blog.csdn.net/weixin_41372626/article/details/109637357