{
"schemes": [
"https"
],
"swagger": "2.0",
"info": {
"title": "Azure Media Services",
"description": "This Swagger was generated by the API Framework.",
"version": "2020-05-01"
},
"host": "management.azure.com",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"securityDefinitions": {
"azure_auth": {
"type": "oauth2",
"authorizationUrl": "https://login.microsoftonline.com/common/oauth2/authorize",
"flow": "implicit",
"description": "Azure Active Directory OAuth2 Flow",
"scopes": {
"user_impersonation": "Impersonate your user account"
}
}
},
"definitions": {
"Preset": {
"discriminator": "@odata.type",
"properties": {
"@odata.type": {
"type": "string",
"description": "The discriminator for derived types."
}
},
"type": "object",
"required": [
"@odata.type"
],
"description": "Base type for all Presets, which define the recipe or instructions on how the input media files should be processed."
},
"Codec": {
"discriminator": "@odata.type",
"properties": {
"@odata.type": {
"type": "string",
"description": "The discriminator for derived types."
},
"label": {
"type": "string",
"description": "An optional label for the codec. The label can be used to control muxing behavior."
}
},
"type": "object",
"required": [
"@odata.type"
],
"description": "Describes the basic properties of all codecs."
},
"Audio": {
"x-ms-discriminator-value": "#Microsoft.Media.Audio",
"allOf": [
{
"$ref": "#/definitions/Codec"
}
],
"properties": {
"channels": {
"type": "integer",
"format": "int32",
"description": "The number of channels in the audio."
},
"samplingRate": {
"type": "integer",
"format": "int32",
"description": "The sampling rate to use for encoding in hertz."
},
"bitrate": {
"type": "integer",
"format": "int32",
"description": "The bitrate, in bits per second, of the output encoded audio."
}
},
"type": "object",
"description": "Defines the common properties for all audio codecs."
},
"AacAudio": {
"x-ms-discriminator-value": "#Microsoft.Media.AacAudio",
"allOf": [
{
"$ref": "#/definitions/Audio"
}
],
"properties": {
"profile": {
"type": "string",
"enum": [
"AacLc",
"HeAacV1",
"HeAacV2"
],
"x-ms-enum": {
"name": "AacAudioProfile",
"values": [
{
"value": "AacLc",
"description": "Specifies that the output audio is to be encoded into AAC Low Complexity profile (AAC-LC)."
},
{
"value": "HeAacV1",
"description": "Specifies that the output audio is to be encoded into HE-AAC v1 profile."
},
{
"value": "HeAacV2",
"description": "Specifies that the output audio is to be encoded into HE-AAC v2 profile."
}
],
"modelAsString": true
},
"description": "The encoding profile to be used when encoding audio with AAC."
}
},
"type": "object",
"description": "Describes Advanced Audio Codec (AAC) audio encoding settings."
},
"Layer": {
"discriminator": "@odata.type",
"properties": {
"@odata.type": {
"type": "string",
"description": "The discriminator for derived types."
},
"width": {
"type": "string",
"description": "The width of the output video for this layer. The value can be absolute (in pixels) or relative (in percentage). For example 50% means the output video has half as many pixels in width as the input."
},
"height": {
"type": "string",
"description": "The height of the output video for this layer. The value can be absolute (in pixels) or relative (in percentage). For example 50% means the output video has half as many pixels in height as the input."
},
"label": {
"type": "string",
"description": "The alphanumeric label for this layer, which can be used in multiplexing different video and audio layers, or in naming the output file."
}
},
"type": "object",
"required": [
"@odata.type"
],
"description": "The encoder can be configured to produce video and/or images (thumbnails) at different resolutions, by specifying a layer for each desired resolution. A layer represents the properties for the video or image at a resolution."
},
"H265VideoLayer": {
"x-ms-discriminator-value": "#Microsoft.Media.H265VideoLayer",
"allOf": [
{
"$ref": "#/definitions/Layer"
}
],
"properties": {
"bitrate": {
"type": "integer",
"format": "int32",
"description": "The average bitrate in bits per second at which to encode the input video when generating this layer. For example: a target bitrate of 3000Kbps or 3Mbps means this value should be 3000000 This is a required field."
},
"maxBitrate": {
"type": "integer",
"format": "int32",
"description": "The maximum bitrate (in bits per second), at which the VBV buffer should be assumed to refill. If not specified, defaults to the same value as bitrate."
},
"bFrames": {
"type": "integer",
"format": "int32",
"description": "The number of B-frames to be used when encoding this layer. If not specified, the encoder chooses an appropriate number based on the video profile and level."
},
"frameRate": {
"type": "string",
"description": "The frame rate (in frames per second) at which to encode this layer. The value can be in the form of M/N where M and N are integers (For example, 30000/1001), or in the form of a number (For example, 30, or 29.97). The encoder enforces constraints on allowed frame rates based on the profile and level. If it is not specified, the encoder will use the same frame rate as the input video."
},
"slices": {
"type": "integer",
"format": "int32",
"description": "The number of slices to be used when encoding this layer. If not specified, default is zero, which means that encoder will use a single slice for each frame."
},
"adaptiveBFrame": {
"type": "boolean",
"description": "Specifies whether or not adaptive B-frames are to be used when encoding this layer. If not specified, the encoder will turn it on whenever the video profile permits its use."
}
},
"type": "object",
"required": [
"bitrate"
],
"description": "Describes the settings to be used when encoding the input video into a desired output bitrate layer."
},
"H265Layer": {
"x-ms-discriminator-value": "#Microsoft.Media.H265Layer",
"allOf": [
{
"$ref": "#/definitions/H265VideoLayer"
}
],
"properties": {
"profile": {
"type": "string",
"enum": [
"Auto",
"Main"
],
"x-ms-enum": {
"name": "H265VideoProfile",
"values": [
{
"value": "Auto",
"description": "Tells the encoder to automatically determine the appropriate H.265 profile."
},
{
"value": "Main",
"description": "Main profile (https://x265.readthedocs.io/en/default/cli.html?highlight=profile#profile-level-tier)"
}
],
"modelAsString": true
},
"description": "We currently support Main. Default is Auto."
},
"level": {
"type": "string",
"description": "We currently support Level up to 6.2. The value can be Auto, or a number that matches the H.265 profile. If not specified, the default is Auto, which lets the encoder choose the Level that is appropriate for this layer."
},
"bufferWindow": {
"type": "string",
"format": "duration",
"description": "The VBV buffer window length. The value should be in ISO 8601 format. The value should be in the range [0.1-100] seconds. The default is 5 seconds (for example, PT5S)."
},
"referenceFrames": {
"type": "integer",
"format": "int32",
"description": "The number of reference frames to be used when encoding this layer. If not specified, the encoder determines an appropriate number based on the encoder complexity setting."
}
},
"type": "object",
"description": "Describes the settings to be used when encoding the input video into a desired output bitrate layer with the H.265 video codec."
},
"Video": {
"x-ms-discriminator-value": "#Microsoft.Media.Video",
"allOf": [
{
"$ref": "#/definitions/Codec"
}
],
"properties": {
"keyFrameInterval": {
"type": "string",
"format": "duration",
"description": "The distance between two key frames. The value should be non-zero in the range [0.5, 20] seconds, specified in ISO 8601 format. The default is 2 seconds(PT2S). Note that this setting is ignored if VideoSyncMode.Passthrough is set, where the KeyFrameInterval value will follow the input source setting."
},
"stretchMode": {
"type": "string",
"enum": [
"None",
"AutoSize",
"AutoFit"
],
"x-ms-enum": {
"name": "StretchMode",
"values": [
{
"value": "None",
"description": "Strictly respect the output resolution without considering the pixel aspect ratio or display aspect ratio of the input video."
},
{
"value": "AutoSize",
"description": "Override the output resolution, and change it to match the display aspect ratio of the input, without padding. For example, if the input is 1920x1080 and the encoding preset asks for 1280x1280, then the value in the preset is overridden, and the output will be at 1280x720, which maintains the input aspect ratio of 16:9."
},
{
"value": "AutoFit",
"description": "Pad the output (with either letterbox or pillar box) to honor the output resolution, while ensuring that the active video region in the output has the same aspect ratio as the input. For example, if the input is 1920x1080 and the encoding preset asks for 1280x1280, then the output will be at 1280x1280, which contains an inner rectangle of 1280x720 at aspect ratio of 16:9, and pillar box regions 280 pixels wide at the left and right."
}
],
"modelAsString": true
},
"description": "The resizing mode - how the input video will be resized to fit the desired output resolution(s). Default is AutoSize"
},
"syncMode": {
"type": "string",
"enum": [
"Auto",
"Passthrough",
"Cfr",
"Vfr"
],
"x-ms-enum": {
"name": "VideoSyncMode",
"values": [
{
"value": "Auto",
"description": "This is the default method. Chooses between Cfr and Vfr depending on muxer capabilities. For output format MP4, the default mode is Cfr."
},
{
"value": "Passthrough",
"description": "The presentation timestamps on frames are passed through from the input file to the output file writer. Recommended when the input source has variable frame rate, and are attempting to produce multiple layers for adaptive streaming in the output which have aligned GOP boundaries. Note: if two or more frames in the input have duplicate timestamps, then the output will also have the same behavior"
},
{
"value": "Cfr",
"description": "Input frames will be repeated and/or dropped as needed to achieve exactly the requested constant frame rate. Recommended when the output frame rate is explicitly set at a specified value"
},
{
"value": "Vfr",
"description": "Similar to the Passthrough mode, but if the input has frames that have duplicate timestamps, then only one frame is passed through to the output, and others are dropped. Recommended when the number of output frames is expected to be equal to the number of input frames. For example, the output is used to calculate a quality metric like PSNR against the input"
}
],
"modelAsString": true
},
"description": "The Video Sync Mode"
}
},
"type": "object",
"description": "Describes the basic properties for encoding the input video."
},
"H265Video": {
"x-ms-discriminator-value": "#Microsoft.Media.H265Video",
"allOf": [
{
"$ref": "#/definitions/Video"
}
],
"properties": {
"sceneChangeDetection": {
"type": "boolean",
"description": "Specifies whether or not the encoder should insert key frames at scene changes. If not specified, the default is false. This flag should be set to true only when the encoder is being configured to produce a single output video."
},
"complexity": {
"type": "string",
"enum": [
"Speed",
"Balanced",
"Quality"
],
"x-ms-enum": {
"name": "H265Complexity",
"values": [
{
"value": "Speed",
"description": "Tells the encoder to use settings that are optimized for faster encoding. Quality is sacrificed to decrease encoding time."
},
{
"value": "Balanced",
"description": "Tells the encoder to use settings that achieve a balance between speed and quality."
},
{
"value": "Quality",
"description": "Tells the encoder to use settings that are optimized to produce higher quality output at the expense of slower overall encode time."
}
],
"modelAsString": true
},
"description": "Tells the encoder how to choose its encoding settings. Quality will provide for a higher compression ratio but at a higher cost and longer compute time. Speed will produce a relatively larger file but is faster and more economical. The default value is Balanced."
},
"layers": {
"type": "array",
"items": {
"$ref": "#/definitions/H265Layer"
},
"description": "The collection of output H.265 layers to be produced by the encoder."
}
},
"type": "object",
"description": "Describes all the properties for encoding a video with the H.265 codec."
},
"TrackDescriptor": {
"discriminator": "@odata.type",
"properties": {
"@odata.type": {
"type": "string",
"description": "The discriminator for derived types."
}
},
"type": "object",
"required": [
"@odata.type"
],
"description": "Base type for all TrackDescriptor types, which define the metadata and selection for tracks that should be processed by a Job"
},
"AudioTrackDescriptor": {
"x-ms-discriminator-value": "#Microsoft.Media.AudioTrackDescriptor",
"allOf": [
{
"$ref": "#/definitions/TrackDescriptor"
}
],
"properties": {
"channelMapping": {
"type": "string",
"enum": [
"FrontLeft",
"FrontRight",
"Center",
"LowFrequencyEffects",
"BackLeft",
"BackRight",
"StereoLeft",
"StereoRight"
],
"x-ms-enum": {
"name": "ChannelMapping",
"values": [
{
"value": "FrontLeft",
"description": "The Front Left Channel."
},
{
"value": "FrontRight",
"description": "The Front Right Channel."
},
{
"value": "Center",
"description": "The Center Channel."
},
{
"value": "LowFrequencyEffects",
"description": "Low Frequency Effects Channel. Sometimes referred to as the Subwoofer."
},
{
"value": "BackLeft",
"description": "The Back Left Channel. Sometimes referred to as the Left Surround Channel."
},
{
"value": "BackRight",
"description": "The Back Right Channel. Sometimes referred to as the Right Surround Channel."
},
{
"value": "StereoLeft",
"description": "The Left Stereo channel. Sometimes referred to as Down Mix Left."
},
{
"value": "StereoRight",
"description": "The Right Stereo channel. Sometimes referred to as Down Mix Right."
}
],
"modelAsString": true
},
"description": "Optional designation for single channel audio tracks. Can be used to combine the tracks into stereo or multi-channel audio tracks."
}
},
"type": "object",
"description": "A TrackSelection to select audio tracks."
},
"SelectAudioTrackByAttribute": {
"x-ms-discriminator-value": "#Microsoft.Media.SelectAudioTrackByAttribute",
"allOf": [
{
"$ref": "#/definitions/AudioTrackDescriptor"
}
],
"properties": {
"attribute": {
"type": "string",
"enum": [
"Bitrate",
"Language"
],
"x-ms-enum": {
"name": "TrackAttribute",
"values": [
{
"value": "Bitrate",
"description": "The bitrate of the track."
},
{
"value": "Language",
"description": "The language of the track."
}
],
"modelAsString": true
},
"description": "The TrackAttribute to filter the tracks by."
},
"filter": {
"type": "string",
"enum": [
"All",
"Top",
"Bottom",
"ValueEquals"
],
"x-ms-enum": {
"name": "AttributeFilter",
"values": [
{
"value": "All",
"description": "All tracks will be included."
},
{
"value": "Top",
"description": "The first track will be included when the attribute is sorted in descending order. Generally used to select the largest bitrate."
},
{
"value": "Bottom",
"description": "The first track will be included when the attribute is sorted in ascending order. Generally used to select the smallest bitrate."
},
{
"value": "ValueEquals",
"description": "Any tracks that have an attribute equal to the value given will be included."
}
],
"modelAsString": true
},
"description": "The type of AttributeFilter to apply to the TrackAttribute in order to select the tracks."
},
"filterValue": {
"type": "string",
"description": "The value to filter the tracks by. Only used when AttributeFilter.ValueEquals is specified for the Filter property."
}
},
"type": "object",
"required": [
"attribute",
"filter"
],
"description": "Select audio tracks from the input by specifying an attribute and an attribute filter."
},
"SelectAudioTrackById": {
"x-ms-discriminator-value": "#Microsoft.Media.SelectAudioTrackById",
"allOf": [
{
"$ref": "#/definitions/AudioTrackDescriptor"
}
],
"properties": {
"trackId": {
"type": "integer",
"format": "int64",
"description": "Track identifier to select"
}
},
"type": "object",
"required": [
"trackId"
],
"description": "Select audio tracks from the input by specifying a track identifier."
},
"InputDefinition": {
"discriminator": "@odata.type",
"properties": {
"@odata.type": {
"type": "string",
"description": "The discriminator for derived types."
},
"includedTracks": {
"type": "array",
"items": {
"$ref": "#/definitions/TrackDescriptor"
},
"description": "The list of TrackDescriptors which define the metadata and selection of tracks in the input."
}
},
"type": "object",
"required": [
"@odata.type"
],
"description": "Base class for defining an input. Use sub classes of this class to specify tracks selections and related metadata."
},
"FromAllInputFile": {
"x-ms-discriminator-value": "#Microsoft.Media.FromAllInputFile",
"allOf": [
{
"$ref": "#/definitions/InputDefinition"
}
],
"properties": {},
"type": "object",
"description": "An InputDefinition that looks across all of the files provided to select tracks specified by the IncludedTracks property. Generally used with the AudioTrackByAttribute and VideoTrackByAttribute to allow selection of a single track across a set of input files."
},
"FromEachInputFile": {
"x-ms-discriminator-value": "#Microsoft.Media.FromEachInputFile",
"allOf": [
{
"$ref": "#/definitions/InputDefinition"
}
],
"properties": {},
"type": "object",
"description": "An InputDefinition that looks at each input file provided to select tracks specified by the IncludedTracks property. Generally used with the AudioTrackByAttribute and VideoTrackByAttribute to select tracks from each file given."
},
"InputFile": {
"x-ms-discriminator-value": "#Microsoft.Media.InputFile",
"allOf": [
{
"$ref": "#/definitions/InputDefinition"
}
],
"properties": {
"filename": {
"type": "string",
"description": "Name of the file that this input definition applies to."
}
},
"type": "object",
"description": "An InputDefinition for a single file. TrackSelections are scoped to the file specified."
},
"FaceDetectorPreset": {
"x-ms-discriminator-value": "#Microsoft.Media.FaceDetectorPreset",
"allOf": [
{
"$ref": "#/definitions/Preset"
}
],
"properties": {
"resolution": {
"type": "string",
"enum": [
"SourceResolution",
"StandardDefinition"
],
"x-ms-enum": {
"name": "AnalysisResolution",
"values": [
{
"value": "SourceResolution"
},
{
"value": "StandardDefinition"
}
],
"modelAsString": true
},
"description": "Specifies the maximum resolution at which your video is analyzed. The default behavior is \"SourceResolution,\" which will keep the input video at its original resolution when analyzed. Using \"StandardDefinition\" will resize input videos to standard definition while preserving the appropriate aspect ratio. It will only resize if the video is of higher resolution. For example, a 1920x1080 input would be scaled to 640x360 before processing. Switching to \"StandardDefinition\" will reduce the time it takes to process high resolution video. It may also reduce the cost of using this component (see https://azure.microsoft.com/en-us/pricing/details/media-services/#analytics for details). However, faces that end up being too small in the resized video may not be detected."
},
"mode": {
"type": "string",
"enum": [
"Analyze",
"Redact",
"Combined"
],
"x-ms-enum": {
"name": "FaceRedactorMode",
"values": [
{
"value": "Analyze",
"description": "Analyze mode detects faces and outputs a metadata file with the results. Allows editing of the metadata file before faces are blurred with Redact mode."
},
{
"value": "Redact",
"description": "Redact mode consumes the metadata file from Analyze mode and redacts the faces found."
},
{
"value": "Combined",
"description": "Combined mode does the Analyze and Redact steps in one pass when editing the analyzed faces is not desired."
}
],
"modelAsString": true
},
"description": "This mode provides the ability to choose between the following settings: 1) Analyze - For detection only.This mode generates a metadata JSON file marking appearances of faces throughout the video.Where possible, appearances of the same person are assigned the same ID. 2) Combined - Additionally redacts(blurs) detected faces. 3) Redact - This enables a 2-pass process, allowing for selective redaction of a subset of detected faces.It takes in the metadata file from a prior analyze pass, along with the source video, and a user-selected subset of IDs that require redaction."
},
"blurType": {
"type": "string",
"enum": [
"Box",
"Low",
"Med",
"High",
"Black"
],
"x-ms-enum": {
"name": "BlurType",
"values": [
{
"value": "Box",
"description": "Box: debug filter, bounding box only"
},
{
"value": "Low",
"description": "Low: box-car blur filter"
},
{
"value": "Med",
"description": "Med: Gaussian blur filter"
},
{
"value": "High",
"description": "High: Confuse blur filter"
},
{
"value": "Black",
"description": "Black: Black out filter"
}
],
"modelAsString": true
},
"description": "Blur type"
},
"experimentalOptions": {
"type": "object",
"additionalProperties": {
"type": "string"
},
"description": "Dictionary containing key value pairs for parameters not exposed in the preset itself"
}
},
"type": "object",
"description": "Describes all the settings to be used when analyzing a video in order to detect (and optionally redact) all the faces present."
},
"AudioAnalyzerPreset": {
"x-ms-discriminator-value": "#Microsoft.Media.AudioAnalyzerPreset",
"allOf": [
{
"$ref": "#/definitions/Preset"
}
],
"properties": {
"audioLanguage": {
"type": "string",
"description": "The language for the audio payload in the input using the BCP-47 format of 'language tag-region' (e.g: 'en-US'). If you know the language of your content, it is recommended that you specify it. The language must be specified explicitly for AudioAnalysisMode::Basic, since automatic language detection is not included in basic mode. If the language isn't specified or set to null, automatic language detection will choose the first language detected and process with the selected language for the duration of the file. It does not currently support dynamically switching between languages after the first language is detected. The automatic detection works best with audio recordings with clearly discernable speech. If automatic detection fails to find the language, transcription would fallback to 'en-US'.\" The list of supported languages is available here: https://go.microsoft.com/fwlink/?linkid=2109463"
},
"mode": {
"type": "string",
"enum": [
"Standard",
"Basic"
],
"x-ms-enum": {
"name": "AudioAnalysisMode",
"values": [
{
"value": "Standard",
"description": "Performs all operations included in the Basic mode, additionally performing language detection and speaker diarization."
},
{
"value": "Basic",
"description": "This mode performs speech-to-text transcription and generation of a VTT subtitle/caption file. The output of this mode includes an Insights JSON file including only the keywords, transcription,and timing information. Automatic language detection and speaker diarization are not included in this mode."
}
],
"modelAsString": true
},
"description": "Determines the set of audio analysis operations to be performed. If unspecified, the Standard AudioAnalysisMode would be chosen."
},
"experimentalOptions": {
"type": "object",
"additionalProperties": {
"type": "string"
},
"description": "Dictionary containing key value pairs for parameters not exposed in the preset itself"
}
},
"type": "object",
"description": "The Audio Analyzer preset applies a pre-defined set of AI-based analysis operations, including speech transcription. Currently, the preset supports processing of content with a single audio track."
},
"Overlay": {
"discriminator": "@odata.type",
"properties": {
"@odata.type": {
"type": "string",
"description": "The discriminator for derived types."
},
"inputLabel": {
"type": "string",
"description": "The label of the job input which is to be used as an overlay. The Input must specify exactly one file. You can specify an image file in JPG, PNG, GIF or BMP format, or an audio file (such as a WAV, MP3, WMA or M4A file), or a video file. See https://aka.ms/mesformats for the complete list of supported audio and video file formats."
},
"start": {
"type": "string",
"format": "duration",
"description": "The start position, with reference to the input video, at which the overlay starts. The value should be in ISO 8601 format. For example, PT05S to start the overlay at 5 seconds into the input video. If not specified the overlay starts from the beginning of the input video."
},
"end": {
"type": "string",
"format": "duration",
"description": "The end position, with reference to the input video, at which the overlay ends. The value should be in ISO 8601 format. For example, PT30S to end the overlay at 30 seconds into the input video. If not specified or the value is greater than the input video duration, the overlay will be applied until the end of the input video if the overlay media duration is greater than the input video duration, else the overlay will last as long as the overlay media duration."
},
"fadeInDuration": {
"type": "string",
"format": "duration",
"description": "The duration over which the overlay fades in onto the input video. The value should be in ISO 8601 duration format. If not specified the default behavior is to have no fade in (same as PT0S)."
},
"fadeOutDuration": {
"type": "string",
"format": "duration",
"description": "The duration over which the overlay fades out of the input video. The value should be in ISO 8601 duration format. If not specified the default behavior is to have no fade out (same as PT0S)."
},
"audioGainLevel": {
"type": "number",
"format": "double",
"description": "The gain level of audio in the overlay. The value should be in the range [0, 1.0]. The default is 1.0."
}
},
"type": "object",
"required": [
"@odata.type",
"inputLabel"
],
"description": "Base type for all overlays - image, audio or video."
},
"AudioOverlay": {
"x-ms-discriminator-value": "#Microsoft.Media.AudioOverlay",
"allOf": [
{
"$ref": "#/definitions/Overlay"
}
],
"properties": {},
"type": "object",
"description": "Describes the properties of an audio overlay."
},
"CopyVideo": {
"x-ms-discriminator-value": "#Microsoft.Media.CopyVideo",
"allOf": [
{
"$ref": "#/definitions/Codec"
}
],
"properties": {},
"type": "object",
"description": "A codec flag, which tells the encoder to copy the input video bitstream without re-encoding."
},
"Image": {
"x-ms-discriminator-value": "#Microsoft.Media.Image",
"allOf": [
{
"$ref": "#/definitions/Video"
}
],
"properties": {
"start": {
"type": "string",
"description": "The position in the input video from where to start generating thumbnails. The value can be in ISO 8601 format (For example, PT05S to start at 5 seconds), or a frame count (For example, 10 to start at the 10th frame), or a relative value to stream duration (For example, 10% to start at 10% of stream duration). Also supports a macro {Best}, which tells the encoder to select the best thumbnail from the first few seconds of the video and will only produce one thumbnail, no matter what other settings are for Step and Range. The default value is macro {Best}."
},
"step": {
"type": "string",
"description": "The intervals at which thumbnails are generated. The value can be in ISO 8601 format (For example, PT05S for one image every 5 seconds), or a frame count (For example, 30 for one image every 30 frames), or a relative value to stream duration (For example, 10% for one image every 10% of stream duration). Note: Step value will affect the first generated thumbnail, which may not be exactly the one specified at transform preset start time. This is due to the encoder, which tries to select the best thumbnail between start time and Step position from start time as the first output. As the default value is 10%, it means if stream has long duration, the first generated thumbnail might be far away from the one specified at start time. Try to select reasonable value for Step if the first thumbnail is expected close to start time, or set Range value at 1 if only one thumbnail is needed at start time."
},
"range": {
"type": "string",
"description": "The position relative to transform preset start time in the input video at which to stop generating thumbnails. The value can be in ISO 8601 format (For example, PT5M30S to stop at 5 minutes and 30 seconds from start time), or a frame count (For example, 300 to stop at the 300th frame from the frame at start time. If this value is 1, it means only producing one thumbnail at start time), or a relative value to the stream duration (For example, 50% to stop at half of stream duration from start time). The default value is 100%, which means to stop at the end of the stream."
}
},
"type": "object",
"required": [
"start"
],
"description": "Describes the basic properties for generating thumbnails from the input video"
},
"Format": {
"discriminator": "@odata.type",
"properties": {
"@odata.type": {
"type": "string",
"description": "The discriminator for derived types."
},
"filenamePattern": {
"type": "string",
"description": "The pattern of the file names for the generated output files. The following macros are supported in the file name: {Basename} - An expansion macro that will use the name of the input video file. If the base name(the file suffix is not included) of the input video file is less than 32 characters long, the base name of input video files will be used. If the length of base name of the input video file exceeds 32 characters, the base name is truncated to the first 32 characters in total length. {Extension} - The appropriate extension for this format. {Label} - The label assigned to the codec/layer. {Index} - A unique index for thumbnails. Only applicable to thumbnails. {Bitrate} - The audio/video bitrate. Not applicable to thumbnails. {Codec} - The type of the audio/video codec. {Resolution} - The video resolution. Any unsubstituted macros will be collapsed and removed from the filename."
}
},
"type": "object",
"required": [
"@odata.type",
"filenamePattern"
],
"description": "Base class for output."
},
"ImageFormat": {
"x-ms-discriminator-value": "#Microsoft.Media.ImageFormat",
"allOf": [
{
"$ref": "#/definitions/Format"
}
],
"properties": {},
"type": "object",
"description": "Describes the properties for an output image file."
},
"JpgFormat": {
"x-ms-discriminator-value": "#Microsoft.Media.JpgFormat",
"allOf": [
{
"$ref": "#/definitions/ImageFormat"
}
],
"properties": {},
"type": "object",
"description": "Describes the settings for producing JPEG thumbnails."
},
"PngFormat": {
"x-ms-discriminator-value": "#Microsoft.Media.PngFormat",
"allOf": [
{
"$ref": "#/definitions/ImageFormat"
}
],
"properties": {},
"type": "object",
"description": "Describes the settings for producing PNG thumbnails."
},
"CopyAudio": {
"x-ms-discriminator-value": "#Microsoft.Media.CopyAudio",
"allOf": [
{
"$ref": "#/definitions/Codec"
}
],
"properties": {},
"type": "object",
"description": "A codec flag, which tells the encoder to copy the input audio bitstream."
},
"Deinterlace": {
"properties": {
"parity": {
"type": "string",
"enum": [
"Auto",
"TopFieldFirst",
"BottomFieldFirst"
],
"x-ms-enum": {
"name": "DeinterlaceParity",
"values": [
{
"value": "Auto",
"description": "Automatically detect the order of fields"
},
{
"value": "TopFieldFirst",
"description": "Apply top field first processing of input video."
},
{
"value": "BottomFieldFirst",
"description": "Apply bottom field first processing of input video."
}
],
"modelAsString": true
},
"description": "The field parity for de-interlacing, defaults to Auto."
},
"mode": {
"type": "string",
"enum": [
"Off",
"AutoPixelAdaptive"
],
"x-ms-enum": {
"name": "DeinterlaceMode",
"values": [
{
"value": "Off",
"description": "Disables de-interlacing of the source video."
},
{
"value": "AutoPixelAdaptive",
"description": "Apply automatic pixel adaptive de-interlacing on each frame in the input video."
}
],
"modelAsString": true
},
"description": "The deinterlacing mode. Defaults to AutoPixelAdaptive."
}
},
"type": "object",
"description": "Describes the de-interlacing settings."
},
"Rectangle": {
"properties": {
"left": {
"type": "string",
"description": "The number of pixels from the left-margin. This can be absolute pixel value (e.g 100), or relative to the size of the video (For example, 50%)."
},
"top": {
"type": "string",
"description": "The number of pixels from the top-margin. This can be absolute pixel value (e.g 100), or relative to the size of the video (For example, 50%)."
},
"width": {
"type": "string",
"description": "The width of the rectangular region in pixels. This can be absolute pixel value (e.g 100), or relative to the size of the video (For example, 50%)."
},
"height": {
"type": "string",
"description": "The height of the rectangular region in pixels. This can be absolute pixel value (e.g 100), or relative to the size of the video (For example, 50%)."
}
},
"type": "object",
"description": "Describes the properties of a rectangular window applied to the input media before processing it."
},
"Filters": {
"properties": {
"deinterlace": {
"$ref": "#/definitions/Deinterlace",
"description": "The de-interlacing settings."
},
"rotation": {
"type": "string",
"enum": [
"Auto",
"None",
"Rotate0",
"Rotate90",
"Rotate180",
"Rotate270"
],
"x-ms-enum": {
"name": "Rotation",