将 H264 + PCMA 转为MP4格式(RTSP协议)
问题:
通过rtsp交互,解析RTP流(h264视频+alaw(pcma)音频), 将其保存为MP4格式。
解决方案:
使用MP4V2 ,版本2.0.0, VC2010可编译。(还有一种gpac库,听说支持h265,未实验)
Mp4V2下载地址: https://code.google.com/archive/p/mp4v2/downloads , 文件名mp4v2-2.0.0.tar.bz2
参考文章: 使用mp4v2将H264+AAC合成mp4文件
参考文章: 将RTSP流录制为mp4文件
参考文章: linux下利用mp4v2库将h264和aac文件封装成MP4
参考文章: Android音视频系列:视频容器操作篇 -- mp4容器打包实现
参考文章: Mp4V2调试经验记录 (视频或者音画同步的主要参数是duration)
参考文章:mp4v2 接口函数 : 其中有一些注意的地方,我似乎没有理会。
PS: 考虑过FFmpeg,但是太大,Windows下编译麻烦,不会裁剪。Mp4V2比较轻量级,适合做一个SDK。
前期:
建议先对MP4格式有一点基本了解, 看看一些MP4格式的文档,有一个基本概念。
参考文章: mp4文件格式解析 , 介绍详细,其中推荐mp4v2 和 gpac,和一个在线MP4格式解析器,
参考文章(代码): 逐个Box解析MP4文件的代码,挺好的。 MP4 file analyzer (This is a console application, written by MSVC 2008.It will display mp4 file content in human readable format.)
套路:
// Outline of the muxing flow (schematic: pH264, pCMA, len, key_frame, _fps, _width
// and _height are supplied by the surrounding RTP receive code, which is not shown).
// Create the MP4 file
MP4FileHandle file = MP4CreateEx("d:\\test.mp4", MP4_CREATE_64BIT_DATA | MP4_CREATE_64BIT_TIME);
MP4SetTimeScale(file, 90000);
MP4TrackId video = MP4AddH264VideoTrack(file, 90000, 90000/_fps, _width, _height,
0x64, //sps[1] AVCProfileIndication
0x00, //sps[2] profile_compat
0x1f, //sps[3] AVCLevelIndication
3); // 4 bytes length before each NAL unit
MP4SetVideoProfileLevel(file, 0x7F);
// MP4AddALawAudioTrack2 is a function newly added to the mp4v2 sources (library rebuilt)
MP4TrackId audio = MP4AddALawAudioTrack2(file,
8000, //timescale
8000*40/1000); //sampleDuration: 40 ms expressed in timescale units
MP4SetTrackIntegerProperty(file,audio, "mdia.minf.stbl.stsd.alaw.channels",1);
MP4SetAudioProfileLevel(file, 0x2);
while(1)
{
// Update the SPS
// NOTE(review): the pointer skips 4 bytes but the length passed is still len —
// presumably this should be len - 4; confirm.
MP4AddH264SequenceParameterSet(file, video, (const uint8_t*)(pH264+4), len);
// Update the PPS (NOTE(review): same len vs. len - 4 question as for the SPS)
MP4AddH264PictureParameterSet(file, video, (const uint8_t*)(pH264+4), len);
// Write the H.264 sample (the first four bytes need special handling)
if(key_frame)
{
// Last argument 1: written as a sync sample
MP4WriteSample(file, video, (const uint8_t* )pH264, len, MP4_INVALID_DURATION, 0,
1);
}else{
// Last argument 0: not a sync sample
MP4WriteSample(file, video, (const uint8_t* )pH264, len, MP4_INVALID_DURATION, 0,
0);
}
// Write the PCMA sample
// NOTE(review): reuses the same len variable as the video sample — presumably a
// placeholder for the audio payload length; confirm.
MP4WriteSample(file, audio, (const uint8_t* )pCMA, len , MP4_INVALID_DURATION, 0, 1);
}
MP4Close(file);
PS: 注意Write H264 Sample时,h264流中的NAL,头四个字节是0x00000001, 而mp4中的h264track,头四个字节要求是NAL的长度,并且是大端顺序,所以,写入前,需要进行如下更改:
uint32_t* pSize = (uint32_t*)pH264 ;
*pSize = htonl(len - 4);
PS: 源码中函数 AddALawAudioTrack里面计算
uint32_t fixedSampleDuration = (timeScale * 20)/1000; // 20mSec/Sample
然而实际的1秒25帧, 间隔40ms, 而不是20ms。故而需要手动添加一个新的接口。
PS: 音频和视频同步没有特别处理,没考虑。
PS: 不确定 SPS、PPS 每次发生变更时,是否都需要重新调用 MP4AddH264SequenceParameterSet 和 MP4AddH264PictureParameterSet,也不确定重复调用会带来什么影响。
简单实验结果:
测试步骤1 : VLC可播放, 有声音。
测试步骤2: 手机端微信可播放。
测试步骤3: 桌面端微信不可播放。 (猜测可能是音频格式需要AAC格式)
测试步骤4: Html5 可播放,有声音。
测试步骤5: Windows Media Player可播放, 但是没声音。
困扰: 发现MP4文件中的mdat 数据特别长, 不知道是否有影响, 这份生成的MP4文件不支持格式工厂转换。
其他:
感觉MP4似乎主要用来打包H264和AAC的,而我用的音频是PCMA,不知道有没有兼容性问题。
mp4格式好复杂,结构可变,又多, 里面有好多Box结构体,我曾经企图一个字节一个字节的研究,后来放弃了,复杂。
附图(图片来自网络):
参考源码: (来源: https://github.com/Thinkerfans/lib-mp4v2/tree/master/mp4v2)
lib-mp4v2/mp4v2/mp4record.h
//
// mp4record.h
// RTSP_Player
//
// Created by apple on 15/4/7.
// Copyright (c) 2015年 thinker. All rights reserved.
//
/* Public interface of a small MP4 recorder built on mp4v2. */
#ifndef __RTSP_Player__mp4record__
#define __RTSP_Player__mp4record__
#include "mp4v2.h"
/* Classification codes mp4VEncode() assigns to an incoming H.264 NAL unit. */
#define _NALU_SPS_ 0  /* sequence parameter set */
#define _NALU_PPS_ 1  /* picture parameter set */
#define _NALU_I_ 2    /* I-frame (key frame) slice */
#define _NALU_P_ 3    /* P-frame slice */
/*
 * Allocate the recorder context and create the MP4 file `filename`.
 * `width` and `height` are stored and used later when the video track is added.
 * Returns 0 on success, -1 on failure.
 */
int initMp4Encoder(const char * filename,int width,int height);
/*
 * Feed one H.264 NAL unit of `len` bytes. The unit is only recognised when `data`
 * begins with the 4-byte start code 00 00 00 01. The video track is created when
 * the first SPS arrives; units received before that are rejected with -1.
 * Returns 0 when the unit was handled or ignored, -1 on failure.
 */
int mp4VEncode(uint8_t * data ,int len);
/*
 * Feed one audio sample of `len` bytes.
 * Returns -1 at least when no video track exists yet; see mp4record.c for the
 * exact conditions.
 */
int mp4AEncode(uint8_t * data ,int len);
/* Close the MP4 file (if open) and release the recorder context. */
void closeMp4Encoder();
#endif /* defined(__RTSP_Player__mp4record__) */
lib-mp4v2/mp4v2/mp4record.c
//
// mp4record.c
// RTSP_Player
//
// Created by apple on 15/4/7.
// Copyright (c) 2015年 thinker. All rights reserved.
//
#include "mp4record.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* State of the single active recording; one instance lives behind recordCtx. */
typedef struct MP4V2_CONTEXT{
    int m_vWidth;                /* video width passed to initMp4Encoder() */
    int m_vHeight;               /* video height passed to initMp4Encoder() */
    int m_vFrateR;               /* frame rate; divides the time scale to get the per-sample duration */
    int m_vTimeScale;            /* time scale used for both the file and the video track */
    MP4FileHandle m_mp4FHandle;  /* handle returned by MP4Create() */
    MP4TrackId m_vTrackId;       /* video track, added when the first SPS is seen */
    MP4TrackId m_aTrackId;       /* audio track id */
    double m_vFrameDur;          /* running counter bumped by 1024 per audio sample */
} MP4V2_CONTEXT;

/* Global recorder context: NULL until initMp4Encoder(), reset by closeMp4Encoder(). */
MP4V2_CONTEXT * recordCtx = NULL;
int initMp4Encoder(const char * filename,int width,int height){
int ret = -1;
recordCtx = malloc(sizeof(struct MP4V2_CONTEXT));
if (!recordCtx) {
printf("error : malloc context \n");
return ret;
}
recordCtx->m_vWidth = width;
recordCtx->m_vHeight = height;
recordCtx->m_vFrateR = 25;
recordCtx->m_vTimeScale = 90000;
recordCtx->m_vFrameDur = 300;
recordCtx->m_vTrackId = 0;
recordCtx->m_aTrackId = 0;
recordCtx->m_mp4FHandle = MP4Create(filename,0);
if (recordCtx->m_mp4FHandle == MP4_INVALID_FILE_HANDLE) {
printf("error : MP4Create \n");
return ret;
}
MP4SetTimeScale(recordCtx->m_mp4FHandle, recordCtx->m_vTimeScale);
//------------------------------------------------------------------------------------- audio track
// recordCtx->m_aTrackId = MP4AddAudioTrack(recordCtx->m_mp4FHandle, 44100, 1024, MP4_MPEG4_AUDIO_TYPE);
// if (recordCtx->m_aTrackId == MP4_INVALID_TRACK_ID){
// printf("error : MP4AddAudioTrack \n");
// return ret;
// }
//
// MP4SetAudioProfileLevel(recordCtx->m_mp4FHandle, 0x2);
// uint8_t aacConfig[2] = {18,16};
// MP4SetTrackESConfiguration(recordCtx->m_mp4FHandle,recordCtx->m_aTrackId,aacConfig,2);
// printf("ok : initMp4Encoder file=%s \n",filename);
return 0;
}
int mp4VEncode(uint8_t * _naluData ,int _naluSize){
int index = -1;
if(_naluData[0]==0 && _naluData[1]==0 && _naluData[2]==0 && _naluData[3]==1 && _naluData[4]==0x67){
index = _NALU_SPS_;
}
if(index!=_NALU_SPS_ && recordCtx->m_vTrackId == MP4_INVALID_TRACK_ID){
return index;
}
if(_naluData[0]==0 && _naluData[1]==0 && _naluData[2]==0 && _naluData[3]==1 && _naluData[4]==0x68){
index = _NALU_PPS_;
}
if(_naluData[0]==0 && _naluData[1]==0 && _naluData[2]==0 && _naluData[3]==1 && _naluData[4]==0x65){
index = _NALU_I_;
}
if(_naluData[0]==0 && _naluData[1]==0 && _naluData[2]==0 && _naluData[3]==1 && _naluData[4]==0x41){
index = _NALU_P_;
}
//
switch(index){
case _NALU_SPS_:
if(recordCtx->m_vTrackId == MP4_INVALID_TRACK_ID){
recordCtx->m_vTrackId = MP4AddH264VideoTrack
(recordCtx->m_mp4FHandle,
recordCtx->m_vTimeScale,
recordCtx->m_vTimeScale / recordCtx->m_vFrateR,
recordCtx->m_vWidth, // width
recordCtx->m_vHeight, // height
_naluData[5], // sps[1] AVCProfileIndication
_naluData[6], // sps[2] profile_compat
_naluData[7], // sps[3] AVCLevelIndication
3); // 4 bytes length before each NAL unit
if (recordCtx->m_vTrackId == MP4_INVALID_TRACK_ID) {
return -1;
}
MP4SetVideoProfileLevel(recordCtx->m_mp4FHandle, 0x7F); // Simple Profile @ Level 3
}
MP4AddH264SequenceParameterSet(recordCtx->m_mp4FHandle,recordCtx->m_vTrackId,_naluData+4,_naluSize-4);
//
break;
case _NALU_PPS_:
MP4AddH264PictureParameterSet(recordCtx->m_mp4FHandle,recordCtx->m_vTrackId,_naluData+4,_naluSize-4);
break;
case _NALU_I_:
{
uint8_t * IFrameData = malloc(_naluSize+1);
//
IFrameData[0] = (_naluSize-3) >>24;
IFrameData[1] = (_naluSize-3) >>16;
IFrameData[2] = (_naluSize-3) >>8;
IFrameData[3] = (_naluSize-3) &0xff;
memcpy(IFrameData+4,_naluData+3,_naluSize-3);
// if(!MP4WriteSample(recordCtx->m_mp4FHandle, recordCtx->m_vTrackId, IFrameData, _naluSize+1, recordCtx->m_vFrameDur/44100*90000, 0, 1)){
// return -1;
// }
// recordCtx->m_vFrameDur = 0;
if(!MP4WriteSample(recordCtx->m_mp4FHandle, recordCtx->m_vTrackId, IFrameData, _naluSize+1, MP4_INVALID_DURATION, 0, 1)){
return -1;
}
free(IFrameData);
//
break;
}
case _NALU_P_:
{
_naluData[0] = (_naluSize-4) >>24;
_naluData[1] = (_naluSize-4) >>16;
_naluData[2] = (_naluSize-4) >>8;
_naluData[3] = (_naluSize-4) &0xff;
// if(!MP4WriteSample(recordCtx->m_mp4FHandle, recordCtx->m_vTrackId, _naluData, _naluSize, recordCtx->m_vFrameDur/44100*90000, 0, 1)){
// return -1;
// }
// recordCtx->m_vFrameDur = 0;
if(!MP4WriteSample(recordCtx->m_mp4FHandle, recordCtx->m_vTrackId, _naluData, _naluSize, MP4_INVALID_DURATION, 0, 1)){
return -1;
}
break;
}
}
return 0;
}
/**
 * Write one audio sample to the audio track.
 *
 * Audio is rejected until the video track exists (as before), and also when no
 * audio track has been created, instead of handing an invalid track id to mp4v2.
 *
 * @param data audio payload
 * @param len  payload size in bytes
 * @return 0 when the sample was written; -1 on invalid input, when the recorder
 *         or either track is not ready, or when MP4WriteSample fails
 */
int mp4AEncode(uint8_t * data ,int len){
    if (!recordCtx || !data || len <= 0) {
        return -1;
    }
    if (recordCtx->m_vTrackId == MP4_INVALID_TRACK_ID) {
        return -1;
    }
    if (recordCtx->m_aTrackId == MP4_INVALID_TRACK_ID) {
        return -1;
    }
    if (!MP4WriteSample(recordCtx->m_mp4FHandle, recordCtx->m_aTrackId, data, len , MP4_INVALID_DURATION, 0, 1)) {
        return -1;
    }
    // Only count samples that actually reached the file.
    recordCtx->m_vFrameDur += 1024;
    return 0;
}
void closeMp4Encoder(){
if(recordCtx){
if (recordCtx->m_mp4FHandle != MP4_INVALID_FILE_HANDLE) {
MP4Close(recordCtx->m_mp4FHandle,0);
recordCtx->m_mp4FHandle = NULL;
}
free(recordCtx);
recordCtx = NULL;
}
printf("ok : closeMp4Encoder \n");
}
上一篇: Scipy之图片降噪