From 8f5d8682ef383954d46beb5fafe51c0f1bfb2897 Mon Sep 17 00:00:00 2001
From: Rafael Soares
Date: Tue, 22 Dec 2020 15:28:03 +0000
Subject: [PATCH] Adds H264 depacketisation logic as discussed in #378.

---
 .../WebRTCExamples/WebRTCReceiver/Program.cs  |   4 +-
 .../WebRTCReceiver/WebRTCReceiver.csproj      |   1 +
 src/net/RTP/H264PayloadProcessor.cs           | 258 ++++++++++++++++++
 src/net/RTP/RTPSession.cs                     |  28 +-
 src/net/RTP/RtpVideoFramer.cs                 |  63 +++--
 5 files changed, 323 insertions(+), 31 deletions(-)
 create mode 100644 src/net/RTP/H264PayloadProcessor.cs

diff --git a/examples/WebRTCExamples/WebRTCReceiver/Program.cs b/examples/WebRTCExamples/WebRTCReceiver/Program.cs
index b8c90b36c..c75ab2282 100644
--- a/examples/WebRTCExamples/WebRTCReceiver/Program.cs
+++ b/examples/WebRTCExamples/WebRTCReceiver/Program.cs
@@ -32,6 +32,7 @@
 using SIPSorcery.Sys;
 using SIPSorceryMedia.Abstractions.V1;
 using SIPSorceryMedia.Encoders;
+using SIPSorceryMedia.FFmpeg;
 using WebSocketSharp.Server;
 
 namespace demo
@@ -106,7 +107,8 @@ static void Main(string[] args)
 
         private static Task<RTCPeerConnection> CreatePeerConnection()
         {
-            var videoEP = new SIPSorceryMedia.Windows.WindowsVideoEndPoint(new VpxVideoEncoder());
+            //var videoEP = new SIPSorceryMedia.Windows.WindowsVideoEndPoint(new VpxVideoEncoder());
+            var videoEP = new SIPSorceryMedia.Windows.WindowsVideoEndPoint(new FFmpegVideoEncoder());
             videoEP.RestrictFormats(format => format.Codec == VideoCodecsEnum.VP8);
 
             videoEP.OnVideoSinkDecodedSample += (byte[] bmp, uint width, uint height, int stride, VideoPixelFormatsEnum pixelFormat) =>
diff --git a/examples/WebRTCExamples/WebRTCReceiver/WebRTCReceiver.csproj b/examples/WebRTCExamples/WebRTCReceiver/WebRTCReceiver.csproj
index c3c16da5a..7b8a84686 100644
--- a/examples/WebRTCExamples/WebRTCReceiver/WebRTCReceiver.csproj
+++ b/examples/WebRTCExamples/WebRTCReceiver/WebRTCReceiver.csproj
@@ -19,6 +19,7 @@
+
diff --git a/src/net/RTP/H264PayloadProcessor.cs b/src/net/RTP/H264PayloadProcessor.cs
new file mode 100644
index 000000000..93b94e9cf
--- /dev/null
+++ b/src/net/RTP/H264PayloadProcessor.cs
@@ -0,0 +1,258 @@
+///
+/// Based on https://github.com/BogdanovKirill/RtspClientSharp/blob/master/RtspClientSharp/MediaParsers/H264VideoPayloadParser.cs
+/// Distributed under MIT License
+///
+/// @author raf.csoares@kyubinteractive.com
+///
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace SIPSorcery.Net
+{
+    public class H264PayloadProcessor
+    {
+        // H264 NAL unit types (H.264 spec Table 7-1): type 5 is a slice of an IDR picture
+        // and type 1 is a slice of a non-IDR picture.
+        const int SPS = 7;
+        const int PPS = 8;
+        const int IDR_SLICE = 5;
+        const int NON_IDR_SLICE = 1;
+
+        // Payload helper fields.
+        uint previous_timestamp = 0;
+        int norm, fu_a, fu_b, stap_a, stap_b, mtap16, mtap24 = 0; // used for diagnostics stats
+        List<KeyValuePair<int, byte[]>> temporary_rtp_payloads = new List<KeyValuePair<int, byte[]>>(); // used to assemble the RTP packets that form one RTP frame
+        MemoryStream fragmented_nal = new MemoryStream(); // used to concatenate fragmented H264 NALs where a NAL is split across multiple RTP packets
+
+        public virtual MemoryStream ProcessRTPPayload(byte[] rtpPayload, ushort seqNum, uint timestamp, int markbit, out bool isKeyFrame)
+        {
+            List<byte[]> nal_units = ProcessRTPPayloadAsNals(rtpPayload, seqNum, timestamp, markbit, out isKeyFrame);
+
+            if (nal_units != null)
+            {
+                // Calculate the total buffer size, discarding any empty NALs.
+                long totalBufferSize = 0;
+                for (int i = 0; i < nal_units.Count; i++)
+                {
+                    var nal = nal_units[i];
+                    long remaining = nal.Length;
+
+                    if (remaining > 0)
+                    {
+                        totalBufferSize += remaining + 4; // nal + 0001
+                    }
+                    else
+                    {
+                        nal_units.RemoveAt(i);
+                        i--;
+                    }
+                }
+
+                // Merge the NALs into the same buffer using the Annex-B separator (0001).
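+                // Each NAL unit is written with a leading four byte Annex-B start code
+                // (0x00 0x00 0x00 0x01), which is what byte-stream decoders use to locate
+                // the NAL unit boundaries in the merged frame.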
+                MemoryStream data = new MemoryStream(new byte[totalBufferSize]);
+                foreach (var nal in nal_units)
+                {
+                    data.WriteByte(0);
+                    data.WriteByte(0);
+                    data.WriteByte(0);
+                    data.WriteByte(1);
+                    data.Write(nal, 0, nal.Length);
+                }
+                return data;
+            }
+            return null;
+        }
+
+        public virtual List<byte[]> ProcessRTPPayloadAsNals(byte[] rtpPayload, ushort seqNum, uint timestamp, int markbit, out bool isKeyFrame)
+        {
+            List<byte[]> nal_units = ProcessH264Payload(rtpPayload, seqNum, timestamp, markbit, out isKeyFrame);
+
+            return nal_units;
+        }
+
+        protected virtual List<byte[]> ProcessH264Payload(byte[] rtp_payload, ushort seqNum, uint rtp_timestamp, int rtp_marker, out bool isKeyFrame)
+        {
+            if (previous_timestamp != rtp_timestamp && previous_timestamp > 0)
+            {
+                // The timestamp has changed without the previous frame completing, so discard the stale payloads.
+                temporary_rtp_payloads.Clear();
+                previous_timestamp = 0;
+                fragmented_nal.SetLength(0);
+            }
+
+            // Add to the list of payloads for the current frame of video.
+            temporary_rtp_payloads.Add(new KeyValuePair<int, byte[]>(seqNum, rtp_payload)); // TODO could optimise this and go direct to ProcessH264PayloadFrame if there is just 1 packet in the frame
+            if (rtp_marker == 1)
+            {
+                // Re-order the payloads to correct for out-of-order UDP delivery.
+                if (temporary_rtp_payloads.Count > 1)
+                {
+                    temporary_rtp_payloads.Sort((a, b) => { return a.Key.CompareTo(b.Key); });
+                }
+
+                // The end marker is set. Process the list of RTP packets (forming one video frame) and extract the NALs.
+                List<byte[]> nal_units = ProcessH264PayloadFrame(temporary_rtp_payloads, out isKeyFrame);
+                temporary_rtp_payloads.Clear();
+                previous_timestamp = 0;
+                fragmented_nal.SetLength(0);
+
+                return nal_units;
+            }
+            else
+            {
+                isKeyFrame = false;
+                previous_timestamp = rtp_timestamp;
+                return null; // We don't have a complete frame yet. Keep accumulating RTP packets.
+            }
+        }
+
+        // Process an RTP frame. An RTP frame can consist of several RTP packets which all share the same timestamp.
+        // Returns a list of NAL units (with no 00 00 00 01 start codes and no size headers).
+        protected virtual List<byte[]> ProcessH264PayloadFrame(List<KeyValuePair<int, byte[]>> rtp_payloads, out bool isKeyFrame)
+        {
+            bool? isKeyFrameNullable = null;
+            List<byte[]> nal_units = new List<byte[]>(); // Stores the NAL units for a video frame. There may be more than one NAL unit per video frame.
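+            // Each RTP payload begins with a single NAL unit header octet, per RFC 6184:
+            //
+            //   +---------------+
+            //   |0|1|2|3|4|5|6|7|
+            //   +-+-+-+-+-+-+-+-+
+            //   |F|NRI|  Type   |
+            //   +---------------+
+            //
+            // Types 1-23 are single NAL unit packets, types 24-27 are aggregation packets
+            // (STAP-A, STAP-B, MTAP16, MTAP24) and types 28-29 are fragmentation units
+            // (FU-A, FU-B).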
+
+            for (int payload_index = 0; payload_index < rtp_payloads.Count; payload_index++)
+            {
+                // Examine the first byte of each RTP payload (the NAL unit header).
+                int nal_header_f_bit = (rtp_payloads[payload_index].Value[0] >> 7) & 0x01;
+                int nal_header_nri = (rtp_payloads[payload_index].Value[0] >> 5) & 0x03;
+                int nal_header_type = (rtp_payloads[payload_index].Value[0] >> 0) & 0x1F;
+
+                // If the NAL header type is in the range 1..23 this is a normal NAL (not fragmented),
+                // so add it to the list as-is.
+                if (nal_header_type >= 1 && nal_header_type <= 23)
+                {
+                    norm++;
+
+                    // Check whether this NAL marks the frame as a key frame.
+                    CheckKeyFrame(nal_header_type, ref isKeyFrameNullable);
+
+                    nal_units.Add(rtp_payloads[payload_index].Value);
+                }
+                // There are 4 types of aggregation packet (multiple NALs in one RTP payload).
+                else if (nal_header_type == 24)
+                {
+                    stap_a++;
+
+                    // The RTP packet contains multiple NALs, each preceded by a 16 bit size field:
+                    // read the 16 bit size, then read the NAL, and repeat.
+                    try
+                    {
+                        int ptr = 1; // start after the NAL header byte whose type was '24'
+                        // If we have at least 2 more bytes (the 16 bit size) then consume more data.
+                        while (ptr + 2 < (rtp_payloads[payload_index].Value.Length - 1))
+                        {
+                            int size = (rtp_payloads[payload_index].Value[ptr] << 8) + (rtp_payloads[payload_index].Value[ptr + 1] << 0);
+                            ptr = ptr + 2;
+                            byte[] nal = new byte[size];
+                            Buffer.BlockCopy(rtp_payloads[payload_index].Value, ptr, nal, 0, size); // copy the NAL
+
+                            byte reconstructed_nal_type = (byte)((nal[0] >> 0) & 0x1F);
+
+                            // Check whether this NAL marks the frame as a key frame.
+                            CheckKeyFrame(reconstructed_nal_type, ref isKeyFrameNullable);
+
+                            nal_units.Add(nal); // Add to the list of NALs for this RTP frame. Start codes like 00 00 00 01 get added later.
+                            ptr = ptr + size;
+                        }
+                    }
+                    catch
+                    {
+                        // A malformed STAP-A payload; ignore the remainder of this packet.
+                    }
+                }
+                else if (nal_header_type == 25)
+                {
+                    stap_b++;
+                }
+                else if (nal_header_type == 26)
+                {
+                    mtap16++;
+                }
+                else if (nal_header_type == 27)
+                {
+                    mtap24++;
+                }
+                else if (nal_header_type == 28)
+                {
+                    fu_a++;
+
+                    // Parse the Fragmentation Unit (FU-A) header.
+                    int fu_indicator = rtp_payloads[payload_index].Value[0];
+                    int fu_header_s = (rtp_payloads[payload_index].Value[1] >> 7) & 0x01; // start marker
+                    int fu_header_e = (rtp_payloads[payload_index].Value[1] >> 6) & 0x01; // end marker
+                    int fu_header_r = (rtp_payloads[payload_index].Value[1] >> 5) & 0x01; // reserved. should be 0
+                    int fu_header_type = (rtp_payloads[payload_index].Value[1] >> 0) & 0x1F; // original NAL unit type
+
+                    // Check the Start and End flags.
+                    if (fu_header_s == 1 && fu_header_e == 0)
+                    {
+                        // Start of Fragment.
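+                        // Per RFC 6184 section 5.8 the fragment payloads do not carry the original
+                        // NAL unit header; it is reconstructed from the F and NRI bits of the FU
+                        // indicator combined with the Type field of the FU header.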
+                        // Initialise the fragmented_nal stream.
+                        // Build the NAL header with the original F and NRI flags but use the Type field from fu_header_type.
+                        byte reconstructed_nal_type = (byte)((nal_header_f_bit << 7) + (nal_header_nri << 5) + fu_header_type);
+
+                        // Empty the stream.
+                        fragmented_nal.SetLength(0);
+
+                        // Add the reconstructed NAL header byte to the memory stream.
+                        fragmented_nal.WriteByte(reconstructed_nal_type);
+
+                        // Copy the rest of the RTP payload to the memory stream.
+                        fragmented_nal.Write(rtp_payloads[payload_index].Value, 2, rtp_payloads[payload_index].Value.Length - 2);
+                    }
+
+                    if (fu_header_s == 0 && fu_header_e == 0)
+                    {
+                        // Middle part of the fragment.
+                        // Append this payload to fragmented_nal.
+                        // The data starts after the FU indicator and FU header bytes.
+                        fragmented_nal.Write(rtp_payloads[payload_index].Value, 2, rtp_payloads[payload_index].Value.Length - 2);
+                    }
+
+                    if (fu_header_s == 0 && fu_header_e == 1)
+                    {
+                        // End part of the fragment.
+                        // Append this payload to fragmented_nal.
+                        // The data starts after the FU indicator and FU header bytes.
+                        fragmented_nal.Write(rtp_payloads[payload_index].Value, 2, rtp_payloads[payload_index].Value.Length - 2);
+
+                        var fragmented_nal_array = fragmented_nal.ToArray();
+                        byte reconstructed_nal_type = (byte)((fragmented_nal_array[0] >> 0) & 0x1F);
+
+                        // Check whether this NAL marks the frame as a key frame.
+                        CheckKeyFrame(reconstructed_nal_type, ref isKeyFrameNullable);
+
+                        // Add the reassembled NAL to the list of NAL units.
+                        nal_units.Add(fragmented_nal_array);
+                        fragmented_nal.SetLength(0);
+                    }
+                }
+                else if (nal_header_type == 29)
+                {
+                    fu_b++;
+                }
+            }
+
+            isKeyFrame = isKeyFrameNullable ?? false;
+
+            // Output all the NALs that form one RTP frame (one frame of video).
+            return nal_units;
+        }
+
+        protected void CheckKeyFrame(int nal_type, ref bool? isKeyFrame)
+        {
+            if (isKeyFrame == null)
+            {
+                // SPS, PPS and IDR slices indicate a key frame; a non-IDR slice indicates it is not one.
+                isKeyFrame = nal_type == SPS || nal_type == PPS || nal_type == IDR_SLICE ? new bool?(true) :
+                    (nal_type == NON_IDR_SLICE ? new bool?(false) : null);
+            }
+            else
+            {
+                isKeyFrame = nal_type == SPS || nal_type == PPS ?
+                    (isKeyFrame.Value ? isKeyFrame : new bool?(false)) :
+                    (nal_type == NON_IDR_SLICE ? new bool?(false) : isKeyFrame);
+            }
+        }
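+
+        // Usage sketch (illustrative only; 'rtpPacket' stands in for the caller's RTP packet
+        // type and 'decoder' for whatever H264 decoder consumes the Annex-B frame):
+        //
+        //     var depacketiser = new H264PayloadProcessor();
+        //     var hdr = rtpPacket.Header;
+        //     MemoryStream frame = depacketiser.ProcessRTPPayload(rtpPacket.Payload,
+        //         hdr.SequenceNumber, hdr.Timestamp, hdr.MarkerBit, out bool isKeyFrame);
+        //     if (frame != null)
+        //     {
+        //         decoder.Decode(frame.ToArray()); // one complete video frame in Annex-B format
+        //     }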
+    }
+}
diff --git a/src/net/RTP/RTPSession.cs b/src/net/RTP/RTPSession.cs
index d915fb355..4ac9d6488 100644
--- a/src/net/RTP/RTPSession.cs
+++ b/src/net/RTP/RTPSession.cs
@@ -2040,14 +2040,8 @@ protected void OnReceive(int localPort, IPEndPoint remoteEndPoint, byte[] buffer
 
                     VideoRemoteTrack.LastRemoteSeqNum = rtpPacket.Header.SequenceNumber;
 
-                    var videoFormat = GetSendingFormat(SDPMediaTypesEnum.video);
-
-                    if (videoFormat.Name() == VideoCodecsEnum.VP8.ToString())
+                    if (_rtpVideoFramer != null)
                     {
-                        if (_rtpVideoFramer == null)
-                        {
-                            _rtpVideoFramer = new RtpVideoFramer(VideoCodecsEnum.VP8);
-                        }
-
                         var frame = _rtpVideoFramer.GotRtpPacket(rtpPacket);
                         if (frame != null)
                         {
@@ -2056,7 +2050,25 @@ protected void OnReceive(int localPort, IPEndPoint remoteEndPoint, byte[] buffer
                     }
                     else
                    {
-                        logger.LogWarning($"The depacketisation logic for video codec {videoFormat.Name()} has not been implemented, PR's welcome!");
+                        var videoFormat = GetSendingFormat(SDPMediaTypesEnum.video);
+
+                        if (videoFormat.ToVideoFormat().Codec == VideoCodecsEnum.VP8 ||
+                            videoFormat.ToVideoFormat().Codec == VideoCodecsEnum.H264)
+                        {
+                            logger.LogDebug($"Video depacketisation codec set to {videoFormat.ToVideoFormat().Codec} for SSRC {rtpPacket.Header.SyncSource}.");
+
+                            _rtpVideoFramer = new RtpVideoFramer(videoFormat.ToVideoFormat().Codec);
+
+                            var frame = _rtpVideoFramer.GotRtpPacket(rtpPacket);
+                            if (frame != null)
+                            {
+                                OnVideoFrameReceived?.Invoke(remoteEndPoint, rtpPacket.Header.Timestamp, frame);
+                            }
+                        }
+                        else
+                        {
+                            logger.LogWarning($"Video depacketisation logic for codec {videoFormat.Name()} has not been implemented, PRs welcome!");
+                        }
                     }
                 }
                 else if(mediaType == SDPMediaTypesEnum.audio && AudioRemoteTrack != null)
diff --git a/src/net/RTP/RtpVideoFramer.cs b/src/net/RTP/RtpVideoFramer.cs
index ed87fa33d..8ab75d222 100644
--- a/src/net/RTP/RtpVideoFramer.cs
+++ b/src/net/RTP/RtpVideoFramer.cs
@@ -32,15 +32,21 @@ public class RtpVideoFramer
         private VideoCodecsEnum _codec;
         private byte[] _currVideoFrame = new byte[MAX_FRAME_SIZE];
        private int _currVideoFramePosn = 0;
+        private H264PayloadProcessor _h264Depacketiser;
 
         public RtpVideoFramer(VideoCodecsEnum codec)
         {
-            if(codec != VideoCodecsEnum.VP8)
+            if (!(codec == VideoCodecsEnum.VP8 || codec == VideoCodecsEnum.H264))
             {
-                throw new NotSupportedException("The RTP video framer currently only understands VP8 encoded frames.");
+                throw new NotSupportedException("The RTP video framer currently only understands H264 and VP8 encoded frames.");
             }
 
             _codec = codec;
+
+            if (_codec == VideoCodecsEnum.H264)
+            {
+                _h264Depacketiser = new H264PayloadProcessor();
+            }
         }
 
         public byte[] GotRtpPacket(RTPPacket rtpPacket)
         {
@@ -50,35 +56,48 @@ public byte[] GotRtpPacket(RTPPacket rtpPacket)
             var payload = rtpPacket.Payload;
 
             //var hdr = rtpPacket.Header;
             //logger.LogDebug($"rtp video, seqnum {hdr.SequenceNumber}, ts {hdr.Timestamp}, marker {hdr.MarkerBit}, payload {payload.Length}.");
 
-            if (_currVideoFramePosn + payload.Length >= MAX_FRAME_SIZE)
+            if (_codec == VideoCodecsEnum.VP8)
             {
-                // Something has gone very wrong. Clear the buffer.
-                _currVideoFramePosn = 0;
-            }
+                if (_currVideoFramePosn + payload.Length >= MAX_FRAME_SIZE)
+                {
+                    // Something has gone very wrong. Clear the buffer.
+                    _currVideoFramePosn = 0;
+                }
 
-            // New frames must have the VP8 Payload Descriptor Start bit set.
-            // The tracking of the current video frame position is to deal with a VP8 frame being split across multiple RTP packets
-            // as per https://tools.ietf.org/html/rfc7741#section-4.4.
-            if (_currVideoFramePosn > 0 || (payload[0] & 0x10) > 0)
-            {
-                RtpVP8Header vp8Header = RtpVP8Header.GetVP8Header(payload);
+                // New frames must have the VP8 payload descriptor Start (S) bit set, which is the
+                // 0x10 bit of the first payload descriptor octet.
+                // The tracking of the current video frame position is to deal with a VP8 frame being split across multiple RTP packets
+                // as per https://tools.ietf.org/html/rfc7741#section-4.4.
+                if (_currVideoFramePosn > 0 || (payload[0] & 0x10) > 0)
+                {
+                    RtpVP8Header vp8Header = RtpVP8Header.GetVP8Header(payload);
 
-                Buffer.BlockCopy(payload, vp8Header.Length, _currVideoFrame, _currVideoFramePosn, payload.Length - vp8Header.Length);
-                _currVideoFramePosn += payload.Length - vp8Header.Length;
+                    Buffer.BlockCopy(payload, vp8Header.Length, _currVideoFrame, _currVideoFramePosn, payload.Length - vp8Header.Length);
+                    _currVideoFramePosn += payload.Length - vp8Header.Length;
 
-                if (rtpPacket.Header.MarkerBit > 0)
-                {
-                    var frame = _currVideoFrame.Take(_currVideoFramePosn).ToArray();
+                    if (rtpPacket.Header.MarkerBit > 0)
+                    {
+                        var frame = _currVideoFrame.Take(_currVideoFramePosn).ToArray();
 
-                    _currVideoFramePosn = 0;
+                        _currVideoFramePosn = 0;
 
-                    return frame;
+                        return frame;
+                    }
+                }
+                else
+                {
+                    logger.LogWarning("Discarding RTP packet, VP8 header Start bit not set.");
+                    //logger.LogWarning($"rtp video, seqnum {hdr.SequenceNumber}, ts {hdr.Timestamp}, marker {hdr.MarkerBit}, payload {payload.Length}.");
                 }
             }
-            else
+            else if (_codec == VideoCodecsEnum.H264)
             {
-                logger.LogWarning("Discarding RTP packet, VP8 header Start bit not set.");
-                //logger.LogWarning($"rtp video, seqnum {hdr.SequenceNumber}, ts {hdr.Timestamp}, marker {hdr.MarkerBit}, payload {payload.Length}.");
+                var hdr = rtpPacket.Header;
+                var frameStream = _h264Depacketiser.ProcessRTPPayload(payload, hdr.SequenceNumber, hdr.Timestamp, hdr.MarkerBit, out bool isKeyFrame);
+
+                if (frameStream != null)
+                {
+                    return frameStream.ToArray();
+                }
             }
 
             return null;