From 33152706d9e777bc2bf6fb6e0887dc2274a8cebb Mon Sep 17 00:00:00 2001
From: Felix LeClair
Date: Thu, 14 Jan 2021 15:35:50 -0500
Subject: [PATCH] new version of vf_scale_cuda is a major bodge, but should be able to do Reinhard tonemapping

A copy of the email to the ffmpeg-devel mailing list follows:

Hey everyone!

I'm trying to wrap my mind around how to deal with CUDA HW frames and how to
implement them. The goal of this filter, once completed, will be to take in a
CUDA frame and tonemap its values to a given specification using a
user-requested algorithm (Mobius, Hable, Reinhard, clip, etc.). This is useful
because it should outperform CPU-based tonemapping by 1-3 orders of magnitude,
depending on the GPU used for the filter.

I've based the attached filter on the vf_scale_cuda.cu filter. For ease of
development, I've kept everything the same, including the name of the filter,
only changing the function within the file. This is very much a bodge to
facilitate development. As such, for testing, this file should replace
vf_scale_cuda.cu in ffmpeg/libavfilter/. FFmpeg should then be compiled as
standard for CUDA filters, and the filter should be invoked as you would
invoke the standard scale_cuda filter. The command would be similar to:

ffmpeg -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 -vf scale_cuda=Source_width:Source_Height -c:a copy -c:v h264_nvenc -b:v 5M output.mp4

The above should decode in hardware, tonemap the frame on the GPU, and
re-encode in hardware at a given bitrate. (A commented sketch of the other
tonemap operators sits in the kernel below.)

I will be on Freenode soon after sending this email (going to put on another
cup of coffee).

Thanks,
FelixCLC (felix__)

Caveat: like all HW filters, how effective this is will depend on how much
overhead is incurred by the memcpy over the PCIe bus to the GPU itself, then
passing the data back once processed.
---
 cuda_filter/filter_example.c    | 445 ++++++++++++++++++++++++++++++++
 cuda_filter/vf_copy.c           |  95 +++++++
 cuda_filter/vf_overlay_cuda.cu  |  19 +-
 cuda_filter/vf_scale_cuda.cu    | 222 ++++------------
 cuda_filter/vf_tonemap_cuda.txt | 331 ++++++++++++++++++++++++
 5 files changed, 932 insertions(+), 180 deletions(-)
 create mode 100644 cuda_filter/filter_example.c
 create mode 100644 cuda_filter/vf_copy.c
 create mode 100644 cuda_filter/vf_tonemap_cuda.txt

diff --git a/cuda_filter/filter_example.c b/cuda_filter/filter_example.c
new file mode 100644
index 0000000..df8afbd
--- /dev/null
+++ b/cuda_filter/filter_example.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2012-2014 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Edge detection filter + * + * @see https://en.wikipedia.org/wiki/Canny_edge_detector + */ + +#include "libavutil/avassert.h" +#include "libavutil/imgutils.h" +#include "libavutil/opt.h" +#include "avfilter.h" +#include "formats.h" +#include "internal.h" +#include "video.h" + +#define PLANE_R 0x4 +#define PLANE_G 0x1 +#define PLANE_B 0x2 +#define PLANE_Y 0x1 +#define PLANE_U 0x2 +#define PLANE_V 0x4 +#define PLANE_A 0x8 + +enum FilterMode { + MODE_WIRES, + MODE_COLORMIX, + MODE_CANNY, + NB_MODE +}; + +struct plane_info { + uint8_t *tmpbuf; + uint16_t *gradients; + char *directions; + int width, height; +}; + +typedef struct EdgeDetectContext { + const AVClass *class; + struct plane_info planes[3]; + int filter_planes; + int nb_planes; + double low, high; + uint8_t low_u8, high_u8; + int mode; +} EdgeDetectContext; + +#define OFFSET(x) offsetof(EdgeDetectContext, x) +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM +static const AVOption edgedetect_options[] = { + { "high", "set high threshold", OFFSET(high), AV_OPT_TYPE_DOUBLE, {.dbl=50/255.}, 0, 1, FLAGS }, + { "low", "set low threshold", OFFSET(low), AV_OPT_TYPE_DOUBLE, {.dbl=20/255.}, 0, 1, FLAGS }, + { "mode", "set mode", OFFSET(mode), AV_OPT_TYPE_INT, {.i64=MODE_WIRES}, 0, NB_MODE-1, FLAGS, "mode" }, + { "wires", "white/gray wires on black", 0, AV_OPT_TYPE_CONST, {.i64=MODE_WIRES}, INT_MIN, INT_MAX, FLAGS, "mode" }, + { "colormix", "mix colors", 0, AV_OPT_TYPE_CONST, {.i64=MODE_COLORMIX}, INT_MIN, INT_MAX, FLAGS, "mode" }, + { "canny", "detect edges on planes", 0, AV_OPT_TYPE_CONST, {.i64=MODE_CANNY}, INT_MIN, INT_MAX, FLAGS, "mode" }, + { "planes", "set planes to filter", OFFSET(filter_planes), AV_OPT_TYPE_FLAGS, {.i64=7}, 1, 0x7, FLAGS, "flags" }, + { "y", "filter luma plane", 0, AV_OPT_TYPE_CONST, {.i64=PLANE_Y}, 0, 0, FLAGS, "flags" }, + { "u", "filter u plane", 0, AV_OPT_TYPE_CONST, {.i64=PLANE_U}, 0, 0, FLAGS, "flags" }, + { "v", "filter v plane", 0, AV_OPT_TYPE_CONST, {.i64=PLANE_V}, 0, 0, FLAGS, "flags" }, + { "r", "filter red plane", 0, AV_OPT_TYPE_CONST, {.i64=PLANE_R}, 0, 0, FLAGS, "flags" }, + { "g", "filter green plane", 0, AV_OPT_TYPE_CONST, {.i64=PLANE_G}, 0, 0, FLAGS, "flags" }, + { "b", "filter blue plane", 0, AV_OPT_TYPE_CONST, {.i64=PLANE_B}, 0, 0, FLAGS, "flags" }, + { NULL } +}; + +AVFILTER_DEFINE_CLASS(edgedetect); + +static av_cold int init(AVFilterContext *ctx) +{ + EdgeDetectContext *edgedetect = ctx->priv; + + edgedetect->low_u8 = edgedetect->low * 255. + .5; + edgedetect->high_u8 = edgedetect->high * 255. 
+ .5; + return 0; +} + +static int query_formats(AVFilterContext *ctx) +{ + const EdgeDetectContext *edgedetect = ctx->priv; + static const enum AVPixelFormat wires_pix_fmts[] = {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE}; + static const enum AVPixelFormat canny_pix_fmts[] = {AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE}; + static const enum AVPixelFormat colormix_pix_fmts[] = {AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE}; + AVFilterFormats *fmts_list; + const enum AVPixelFormat *pix_fmts = NULL; + + if (edgedetect->mode == MODE_WIRES) { + pix_fmts = wires_pix_fmts; + } else if (edgedetect->mode == MODE_COLORMIX) { + pix_fmts = colormix_pix_fmts; + } else if (edgedetect->mode == MODE_CANNY) { + pix_fmts = canny_pix_fmts; + } else { + av_assert0(0); + } + fmts_list = ff_make_format_list(pix_fmts); + if (!fmts_list) + return AVERROR(ENOMEM); + return ff_set_common_formats(ctx, fmts_list); +} + +static int config_props(AVFilterLink *inlink) +{ + int p; + AVFilterContext *ctx = inlink->dst; + EdgeDetectContext *edgedetect = ctx->priv; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); + + edgedetect->nb_planes = inlink->format == AV_PIX_FMT_GRAY8 ? 1 : 3; + for (p = 0; p < edgedetect->nb_planes; p++) { + struct plane_info *plane = &edgedetect->planes[p]; + int vsub = p ? desc->log2_chroma_h : 0; + int hsub = p ? desc->log2_chroma_w : 0; + + plane->width = AV_CEIL_RSHIFT(inlink->w, hsub); + plane->height = AV_CEIL_RSHIFT(inlink->h, vsub); + plane->tmpbuf = av_malloc(plane->width * plane->height); + plane->gradients = av_calloc(plane->width * plane->height, sizeof(*plane->gradients)); + plane->directions = av_malloc(plane->width * plane->height); + if (!plane->tmpbuf || !plane->gradients || !plane->directions) + return AVERROR(ENOMEM); + } + return 0; +} + +static void gaussian_blur(AVFilterContext *ctx, int w, int h, + uint8_t *dst, int dst_linesize, + const uint8_t *src, int src_linesize) +{ + int i, j; + + memcpy(dst, src, w); dst += dst_linesize; src += src_linesize; + if (h > 1) { + memcpy(dst, src, w); dst += dst_linesize; src += src_linesize; + } + for (j = 2; j < h - 2; j++) { + dst[0] = src[0]; + if (w > 1) + dst[1] = src[1]; + for (i = 2; i < w - 2; i++) { + /* Gaussian mask of size 5x5 with sigma = 1.4 */ + dst[i] = ((src[-2*src_linesize + i-2] + src[2*src_linesize + i-2]) * 2 + + (src[-2*src_linesize + i-1] + src[2*src_linesize + i-1]) * 4 + + (src[-2*src_linesize + i ] + src[2*src_linesize + i ]) * 5 + + (src[-2*src_linesize + i+1] + src[2*src_linesize + i+1]) * 4 + + (src[-2*src_linesize + i+2] + src[2*src_linesize + i+2]) * 2 + + + (src[ -src_linesize + i-2] + src[ src_linesize + i-2]) * 4 + + (src[ -src_linesize + i-1] + src[ src_linesize + i-1]) * 9 + + (src[ -src_linesize + i ] + src[ src_linesize + i ]) * 12 + + (src[ -src_linesize + i+1] + src[ src_linesize + i+1]) * 9 + + (src[ -src_linesize + i+2] + src[ src_linesize + i+2]) * 4 + + + src[i-2] * 5 + + src[i-1] * 12 + + src[i ] * 15 + + src[i+1] * 12 + + src[i+2] * 5) / 159; + } + if (w > 2) + dst[i ] = src[i ]; + if (w > 3) + dst[i + 1] = src[i + 1]; + + dst += dst_linesize; + src += src_linesize; + } + if (h > 2) { + memcpy(dst, src, w); dst += dst_linesize; src += src_linesize; + } + if (h > 3) + memcpy(dst, src, w); +} + +enum { + DIRECTION_45UP, + DIRECTION_45DOWN, + DIRECTION_HORIZONTAL, + DIRECTION_VERTICAL, +}; + +static int get_rounded_direction(int gx, int gy) +{ + /* reference angles: + * tan( pi/8) = sqrt(2)-1 + * tan(3pi/8) = 
sqrt(2)+1
+     * Gy/Gx is the tangent of the angle (theta), so Gy/Gx is compared against
+     * <ref-angle>, or more simply Gy against <ref-angle>*Gx
+     *
+     * Gx and Gy bounds = [-1020;1020], using 16-bit arithmetic:
+     *   round((sqrt(2)-1) * (1<<16)) =  27146
+     *   round((sqrt(2)+1) * (1<<16)) = 158218
+     */
+    if (gx) {
+        int tanpi8gx, tan3pi8gx;
+
+        if (gx < 0)
+            gx = -gx, gy = -gy;
+        gy *= (1 << 16);
+        tanpi8gx  =  27146 * gx;
+        tan3pi8gx = 158218 * gx;
+        if (gy > -tan3pi8gx && gy < -tanpi8gx)  return DIRECTION_45UP;
+        if (gy > -tanpi8gx  && gy <  tanpi8gx)  return DIRECTION_HORIZONTAL;
+        if (gy >  tanpi8gx  && gy <  tan3pi8gx) return DIRECTION_45DOWN;
+    }
+    return DIRECTION_VERTICAL;
+}
+
+static void sobel(int w, int h,
+                  uint16_t *dst, int dst_linesize,
+                  int8_t *dir, int dir_linesize,
+                  const uint8_t *src, int src_linesize)
+{
+    int i, j;
+
+    for (j = 1; j < h - 1; j++) {
+        dst += dst_linesize;
+        dir += dir_linesize;
+        src += src_linesize;
+        for (i = 1; i < w - 1; i++) {
+            const int gx =
+                -1*src[-src_linesize + i-1] + 1*src[-src_linesize + i+1]
+                -2*src[                i-1] + 2*src[                i+1]
+                -1*src[ src_linesize + i-1] + 1*src[ src_linesize + i+1];
+            const int gy =
+                -1*src[-src_linesize + i-1] + 1*src[ src_linesize + i-1]
+                -2*src[-src_linesize + i  ] + 2*src[ src_linesize + i  ]
+                -1*src[-src_linesize + i+1] + 1*src[ src_linesize + i+1];
+
+            dst[i] = FFABS(gx) + FFABS(gy);
+            dir[i] = get_rounded_direction(gx, gy);
+        }
+    }
+}
+
+static void non_maximum_suppression(int w, int h,
+                                    uint8_t *dst, int dst_linesize,
+                                    const int8_t *dir, int dir_linesize,
+                                    const uint16_t *src, int src_linesize)
+{
+    int i, j;
+
+#define COPY_MAXIMA(ay, ax, by, bx) do {                \
+    if (src[i] > src[(ay)*src_linesize + i+(ax)] &&     \
+        src[i] > src[(by)*src_linesize + i+(bx)])       \
+        dst[i] = av_clip_uint8(src[i]);                 \
+} while (0)
+
+    for (j = 1; j < h - 1; j++) {
+        dst += dst_linesize;
+        dir += dir_linesize;
+        src += src_linesize;
+        for (i = 1; i < w - 1; i++) {
+            switch (dir[i]) {
+            case DIRECTION_45UP:       COPY_MAXIMA( 1, -1, -1,  1); break;
+            case DIRECTION_45DOWN:     COPY_MAXIMA(-1, -1,  1,  1); break;
+            case DIRECTION_HORIZONTAL: COPY_MAXIMA( 0, -1,  0,  1); break;
+            case DIRECTION_VERTICAL:   COPY_MAXIMA(-1,  0,  1,  0); break;
+            }
+        }
+    }
+}
+
+static void double_threshold(int low, int high, int w, int h,
+                             uint8_t *dst, int dst_linesize,
+                             const uint8_t *src, int src_linesize)
+{
+    int i, j;
+
+    for (j = 0; j < h; j++) {
+        for (i = 0; i < w; i++) {
+            if (src[i] > high) {
+                dst[i] = src[i];
+                continue;
+            }
+
+            if (!(!i || i == w - 1 || !j || j == h - 1) &&
+                src[i] > low &&
+                (src[-src_linesize + i-1] > high ||
+                 src[-src_linesize + i  ] > high ||
+                 src[-src_linesize + i+1] > high ||
+                 src[                i-1] > high ||
+                 src[                i+1] > high ||
+                 src[ src_linesize + i-1] > high ||
+                 src[ src_linesize + i  ] > high ||
+                 src[ src_linesize + i+1] > high))
+                dst[i] = src[i];
+            else
+                dst[i] = 0;
+        }
+        dst += dst_linesize;
+        src += src_linesize;
+    }
+}
+
+static void color_mix(int w, int h,
+                      uint8_t *dst, int dst_linesize,
+                      const uint8_t *src, int src_linesize)
+{
+    int i, j;
+
+    for (j = 0; j < h; j++) {
+        for (i = 0; i < w; i++)
+            dst[i] = (dst[i] + src[i]) >> 1;
+        dst += dst_linesize;
+        src += src_linesize;
+    }
+}
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx = inlink->dst;
+    EdgeDetectContext *edgedetect = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+    int p, direct = 0;
+    AVFrame *out;
+
+    if (edgedetect->mode != MODE_COLORMIX && av_frame_is_writable(in)) {
+        direct = 1;
+        out = in;
+    } else {
+        out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+        
if (!out) { + av_frame_free(&in); + return AVERROR(ENOMEM); + } + av_frame_copy_props(out, in); + } + + for (p = 0; p < edgedetect->nb_planes; p++) { + struct plane_info *plane = &edgedetect->planes[p]; + uint8_t *tmpbuf = plane->tmpbuf; + uint16_t *gradients = plane->gradients; + int8_t *directions = plane->directions; + const int width = plane->width; + const int height = plane->height; + + if (!((1 << p) & edgedetect->filter_planes)) { + if (!direct) + av_image_copy_plane(out->data[p], out->linesize[p], + in->data[p], in->linesize[p], + width, height); + continue; + } + + /* gaussian filter to reduce noise */ + gaussian_blur(ctx, width, height, + tmpbuf, width, + in->data[p], in->linesize[p]); + + /* compute the 16-bits gradients and directions for the next step */ + sobel(width, height, + gradients, width, + directions,width, + tmpbuf, width); + + /* non_maximum_suppression() will actually keep & clip what's necessary and + * ignore the rest, so we need a clean output buffer */ + memset(tmpbuf, 0, width * height); + non_maximum_suppression(width, height, + tmpbuf, width, + directions,width, + gradients, width); + + /* keep high values, or low values surrounded by high values */ + double_threshold(edgedetect->low_u8, edgedetect->high_u8, + width, height, + out->data[p], out->linesize[p], + tmpbuf, width); + + if (edgedetect->mode == MODE_COLORMIX) { + color_mix(width, height, + out->data[p], out->linesize[p], + in->data[p], in->linesize[p]); + } + } + + if (!direct) + av_frame_free(&in); + return ff_filter_frame(outlink, out); +} + +static av_cold void uninit(AVFilterContext *ctx) +{ + int p; + EdgeDetectContext *edgedetect = ctx->priv; + + for (p = 0; p < edgedetect->nb_planes; p++) { + struct plane_info *plane = &edgedetect->planes[p]; + av_freep(&plane->tmpbuf); + av_freep(&plane->gradients); + av_freep(&plane->directions); + } +} + +static const AVFilterPad edgedetect_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = config_props, + .filter_frame = filter_frame, + }, + { NULL } +}; + +static const AVFilterPad edgedetect_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, + { NULL } +}; + +AVFilter ff_vf_edgedetect = { + .name = "edgedetect", + .description = NULL_IF_CONFIG_SMALL("Detect and draw edge."), + .priv_size = sizeof(EdgeDetectContext), + .init = init, + .uninit = uninit, + .query_formats = query_formats, + .inputs = edgedetect_inputs, + .outputs = edgedetect_outputs, + .priv_class = &edgedetect_class, + .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, +}; diff --git a/cuda_filter/vf_copy.c b/cuda_filter/vf_copy.c new file mode 100644 index 0000000..e82feb4 --- /dev/null +++ b/cuda_filter/vf_copy.c @@ -0,0 +1,95 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * copy video filter
+ */
+
+#include "libavutil/imgutils.h"
+#include "libavutil/internal.h"
+#include "avfilter.h"
+#include "internal.h"
+#include "video.h"
+
+static int query_formats(AVFilterContext *ctx)
+{
+    AVFilterFormats *formats = NULL;
+    int fmt;
+
+    for (fmt = 0; av_pix_fmt_desc_get(fmt); fmt++) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
+        int ret;
+        if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL)
+            continue;
+        if ((ret = ff_add_format(&formats, fmt)) < 0)
+            return ret;
+    }
+
+    return ff_set_common_formats(ctx, formats);
+}
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterLink *outlink = inlink->dst->outputs[0];
+    AVFrame *out = ff_get_video_buffer(outlink, in->width, in->height);
+    int ret;
+
+    if (!out) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = av_frame_copy_props(out, in);
+    if (ret < 0)
+        goto fail;
+    ret = av_frame_copy(out, in);
+    if (ret < 0)
+        goto fail;
+    av_frame_free(&in);
+    return ff_filter_frame(outlink, out);
+fail:
+    av_frame_free(&in);
+    av_frame_free(&out);
+    return ret;
+}
+
+static const AVFilterPad avfilter_vf_copy_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = filter_frame,
+    },
+    { NULL }
+};
+
+static const AVFilterPad avfilter_vf_copy_outputs[] = {
+    {
+        .name = "default",
+        .type = AVMEDIA_TYPE_VIDEO,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_copy = {
+    .name          = "copy",
+    .description   = NULL_IF_CONFIG_SMALL("Copy the input video unchanged to the output."),
+    .inputs        = avfilter_vf_copy_inputs,
+    .outputs       = avfilter_vf_copy_outputs,
+    .query_formats = query_formats,
+};
diff --git a/cuda_filter/vf_overlay_cuda.cu b/cuda_filter/vf_overlay_cuda.cu
index 43ec36c..60a95b2 100644
--- a/cuda_filter/vf_overlay_cuda.cu
+++ b/cuda_filter/vf_overlay_cuda.cu
@@ -20,14 +20,11 @@
 
 extern "C" {
 
-__global__ void Overlay_Cuda(
-    int x_position, int y_position,
-    unsigned char* main, int main_linesize,
-    unsigned char* overlay, int overlay_linesize,
-    int overlay_w, int overlay_h,
-    unsigned char* overlay_alpha, int alpha_linesize,
-    int alpha_adj_x, int alpha_adj_y)
+__global__ void Overlay_Cuda(int x_position, int y_position, unsigned char* main, int main_linesize, unsigned char* overlay, int overlay_linesize, int overlay_w, int overlay_h, unsigned char* overlay_alpha, int alpha_linesize, int alpha_adj_x, int alpha_adj_y)
 {
+
+
+
     int x = blockIdx.x * blockDim.x + threadIdx.x;
     int y = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -37,7 +34,7 @@ __global__ void Overlay_Cuda(
         y < y_position )
     {
         return;
-    }
+    }//if statement close
 
     int overlay_x = x - x_position;
     int overlay_y = y - y_position;
@@ -45,10 +42,10 @@
     float alpha = 1.0;
     if (alpha_linesize) {
         alpha = overlay_alpha[alpha_adj_x * overlay_x + alpha_adj_y * overlay_y * alpha_linesize] / 255.0f;
-    }
+    }//if statement close
 
     main[x + y*main_linesize] = alpha * overlay[overlay_x + overlay_y * overlay_linesize] + (1.0f - alpha) * main[x + y*main_linesize];
-}
+}//cuda kernel close
 
-}
+}//extern "C" close
diff --git a/cuda_filter/vf_scale_cuda.cu b/cuda_filter/vf_scale_cuda.cu
index 3f3f405..b3878e7 100644
--- a/cuda_filter/vf_scale_cuda.cu
+++ b/cuda_filter/vf_scale_cuda.cu
@@ -1,5 +1,7 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ * original source Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Change to tonemap style filter copyright Felix LeClair
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -20,192 +22,74 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-extern "C" {
+/*
 
-__global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex,
-                                         unsigned char *dst,
-                                         int dst_width, int dst_height, int dst_pitch,
-                                         int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+Warning: this is a VERY early alpha of a CUDA-accelerated filter to tonemap. Please see the ffmpeg-devel mailing list for the message titled "[vf_tonemap_cuda] VERY alpha ground work- implemented as cuda frame",
+sent on the 14th of January 2021.
+It's poorly written and documented. This should not be merged under any circumstances in its present form.
 
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        int y0 = tex2D<unsigned char>(uchar_tex, xi-dx, yi-dy);
-        int y1 = tex2D<unsigned char>(uchar_tex, xi+dx, yi-dy);
-        int y2 = tex2D<unsigned char>(uchar_tex, xi-dx, yi+dy);
-        int y3 = tex2D<unsigned char>(uchar_tex, xi+dx, yi+dy);
-        dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2);
-    }
-}
-__global__ void Subsample_Bilinear_uchar2(cudaTextureObject_t uchar2_tex,
-                                          uchar2 *dst,
-                                          int dst_width, int dst_height, int dst_pitch2,
-                                          int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+
+*/
 
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        uchar2 c0 = tex2D<uchar2>(uchar2_tex, xi-dx, yi-dy);
-        uchar2 c1 = tex2D<uchar2>(uchar2_tex, xi+dx, yi-dy);
-        uchar2 c2 = tex2D<uchar2>(uchar2_tex, xi-dx, yi+dy);
-        uchar2 c3 = tex2D<uchar2>(uchar2_tex, xi+dx, yi+dy);
-        int2 uv;
-        uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
-        uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
-        dst[yo*dst_pitch2+xo] = make_uchar2((unsigned char)uv.x, (unsigned char)uv.y);
-    }
-}
-__global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex,
-                                          uchar4 *dst,
-                                          int dst_width, int dst_height, int dst_pitch,
-                                          int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+#include "cuda/vector_helpers.cuh"
 
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        uchar4 c0 = tex2D<uchar4>(uchar4_tex, xi-dx, yi-dy);
-        uchar4 c1 = tex2D<uchar4>(uchar4_tex, xi+dx, yi-dy);
-        uchar4 c2 = tex2D<uchar4>(uchar4_tex, xi-dx, yi+dy);
-        uchar4 c3 = tex2D<uchar4>(uchar4_tex, xi+dx, yi+dy);
-        int4 res;
-        res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
-        res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
-        res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2;
-        res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2;
-        dst[yo*dst_pitch+xo] = make_uchar4(
-            (unsigned char)res.x, (unsigned char)res.y, (unsigned char)res.z, (unsigned char)res.w);
-    }
-}
+template<typename T>
+__device__ inline void Subsample_Nearest(cudaTextureObject_t tex,
+                                         T *dst,
+                                         int dst_width, int dst_height, int dst_pitch,
+                                         int src_width, int src_height,
+                                         int bit_depth)
+/*
+tex is the cuda texture
+T is the pixel type; dst is a pointer to the destination frame
+dst_width is the width of the output frame
+dst_height is the height of the output frame
+dst_pitch is the row pitch (stride) of the destination; I suspect it matters when the pixel
+    layout changes with the aspect ratio, so as a bodge I'm going to redefine it as 1 for now
+bit_depth is the number of bits per colour channel
+*/
-__global__ void Subsample_Bilinear_ushort(cudaTextureObject_t ushort_tex,
-                                          unsigned short *dst,
-                                          int dst_width, int dst_height, int dst_pitch,
-                                          int src_width, int src_height)
 {
+
+    dst_pitch = 1; // this is a bodge, but won't be needed once the rest of the source stops going through the legacy scaling code
     int xo = blockIdx.x * blockDim.x + threadIdx.x;
     int yo = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (yo < dst_height && xo < dst_width)
     {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        int y0 = tex2D<unsigned short>(ushort_tex, xi-dx, yi-dy);
-        int y1 = tex2D<unsigned short>(ushort_tex, xi+dx, yi-dy);
-        int y2 = tex2D<unsigned short>(ushort_tex, xi-dx, yi+dy);
-        int y3 = tex2D<unsigned short>(ushort_tex, xi+dx, yi+dy);
-        dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2);
+        float hscale = (float)src_width / (float)dst_width; // the scaling factor in the original function, but I'm going to ignore it
+        float vscale = (float)src_height / (float)dst_height; // as above, going to ignore it
+        float xi = (xo + 0.5f); // * hscale;
+        float yi = (yo + 0.5f); // * vscale;
+        float val_IN = tex2D<float>(tex, xi, yi); // to start I'm doing Reinhard because it's idiot proof
+        float out = val_IN / (val_IN + 1.0f); // Reinhard: x/(x+1); this guarantees a value between 0 and 1. Far from the best algorithm, but fit for purpose
+        dst[yo*dst_pitch+xo] = out; // write back the tonemapped value (for integer T this still needs scaling back up to the type's range)
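+        /*
+         * Sketch of where the other operators from the email could slot in,
+         * on the same normalized val_IN. Not wired up yet; the constants are
+         * the usual published Uncharted 2 values, not ones validated here:
+         *
+         *   clip:  float out = min(val_IN, 1.0f);
+         *
+         *   Hable: const float A=0.15f, B=0.50f, C=0.10f, D=0.20f, E=0.02f, F=0.30f, W=11.2f;
+         *          // partial(x) = ((x*(A*x + C*B) + D*E) / (x*(A*x + B) + D*F)) - E/F
+         *          float num   = (val_IN*(A*val_IN + C*B) + D*E) / (val_IN*(A*val_IN + B) + D*F) - E/F;
+         *          float white = (W*(A*W + C*B) + D*E) / (W*(A*W + B) + D*F) - E/F;
+         *          float out   = num / white; // normalize so the white point W maps to 1.0
+         *
+         *   Mobius needs a user-set threshold before it beats Reinhard, so it
+         *   should wait until the filter grows AVOptions.
+         */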
     }
 }
 
-__global__ void Subsample_Bilinear_ushort2(cudaTextureObject_t ushort2_tex,
-                                           ushort2 *dst,
-                                           int dst_width, int dst_height, int dst_pitch2,
-                                           int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        ushort2 c0 = tex2D<ushort2>(ushort2_tex, xi-dx, yi-dy);
-        ushort2 c1 = tex2D<ushort2>(ushort2_tex, xi+dx, yi-dy);
-        ushort2 c2 = tex2D<ushort2>(ushort2_tex, xi-dx, yi+dy);
-        ushort2 c3 = tex2D<ushort2>(ushort2_tex, xi+dx, yi+dy);
-        int2 uv;
-        uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
-        uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
-        dst[yo*dst_pitch2+xo] = make_ushort2((unsigned short)uv.x, (unsigned short)uv.y);
-    }
-}
-__global__ void Subsample_Bilinear_ushort4(cudaTextureObject_t ushort4_tex,
-                                           ushort4 *dst,
-                                           int dst_width, int dst_height, int dst_pitch,
-                                           int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+extern "C" {
 
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        ushort4 c0 = tex2D<ushort4>(ushort4_tex, xi-dx, yi-dy);
-        ushort4 c1 = tex2D<ushort4>(ushort4_tex, xi+dx, yi-dy);
-        ushort4 c2 = tex2D<ushort4>(ushort4_tex, xi-dx, yi+dy);
-        ushort4 c3 = tex2D<ushort4>(ushort4_tex, xi+dx, yi+dy);
-        int4 res;
-        res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
-        res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
-        res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2;
-        res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2;
-        dst[yo*dst_pitch+xo] = make_ushort4(
-            (unsigned short)res.x, (unsigned short)res.y, (unsigned short)res.z, (unsigned short)res.w);
+// each instantiation calls the device-side code under __device__ inline Subsample_Nearest
+// (this comment lives outside the macro: a // comment inside would break the \ line continuations)
+#define NEAREST_KERNEL(T) \
+    __global__ void Subsample_Nearest_ ## T(cudaTextureObject_t src_tex, \
+                                            T *dst, \
+                                            int dst_width, int dst_height, int dst_pitch, \
+                                            int src_width, int src_height, \
+                                            int bit_depth) \
+    { \
+        Subsample_Nearest<T>(src_tex, dst, \
+                             dst_width, dst_height, dst_pitch, \
+                             src_width, src_height, \
+                             bit_depth); \
     }
-}
+NEAREST_KERNEL(uchar)
+NEAREST_KERNEL(uchar2)
+NEAREST_KERNEL(uchar4)
+
+NEAREST_KERNEL(ushort)
+NEAREST_KERNEL(ushort2)
+NEAREST_KERNEL(ushort4)
 }
diff --git a/cuda_filter/vf_tonemap_cuda.txt b/cuda_filter/vf_tonemap_cuda.txt
new file mode 100644
index 0000000..3938561
--- /dev/null
+++ b/cuda_filter/vf_tonemap_cuda.txt
@@ -0,0 +1,331 @@
+/*
+* Copyright (c) 2021 Felix LeClair
+*
+*[I don't know if this is correct for a copyright notice, please correct me if wrong]
+*
+* Derived in part from the work of NVIDIA in 2017 on the vf_thumbnail_cuda filter
+*
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+Changelog
+
+2021/01/03
+Creation of base files
+2021/01/05
+start from scratch - other approach seems silly
+2021/01/05
+RTFM and just get my shit together
+
+This is the C side.
+All this file needs to do is:
+Negotiate the filter
+Get the frame
+Get the information about the frame
+Pass the frame and information to the CUDA side
+Receive the frame back
+Send it on in the chain
+*/
+
+
+/**
+ * Initialize tonemap_cuda
+ */
+static av_cold int tonemap_cuda_init(AVFilterContext *avctx)
+{
+    tonemapCUDAContext* ctx = avctx->priv;
+    ctx->fs.on_event = &tonemap_cuda_blend;
+
+    return 0;
+}
+
+/**
+ * Uninitialize tonemap_cuda
+ */
+static av_cold void tonemap_cuda_uninit(AVFilterContext *avctx)
+{
+    tonemapCUDAContext* ctx = avctx->priv;
+
+    ff_framesync_uninit(&ctx->fs);
+
+    if (ctx->hwctx && ctx->cu_module) {
+        CUcontext dummy;
+        CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
+        CHECK_CU(cu->cuCtxPushCurrent(ctx->cu_ctx));
+        CHECK_CU(cu->cuModuleUnload(ctx->cu_module));
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+}
+
+//query_formats() goes here
+static int tonemap_cuda_query_formats(AVFilterContext *avctx)
+{
+    static const enum AVPixelFormat pixel_formats[] = {
+        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
+    };
+
+    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
+
+    return ff_set_common_formats(avctx, pix_fmts);
+}
+
+
+
+//Config_props() goes here
+static int tonemap_cuda_config_output(AVFilterLink *outlink)
+{
+
+    extern char vf_tonemap_cuda_ptx[];
+
+    int err;
+    AVFilterContext* avctx = outlink->src;
+    tonemapCUDAContext* ctx = avctx->priv;
+
+    AVFilterLink *inlink = avctx->inputs[0];
+    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+
+    AVFilterLink *inlink_tonemap = avctx->inputs[1];
+    AVHWFramesContext *frames_ctx_tonemap = (AVHWFramesContext*)inlink_tonemap->hw_frames_ctx->data;
+
+    CUcontext dummy, cuda_ctx;
+    CudaFunctions *cu;
+
+    // check main input formats
+
+    if (!frames_ctx) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on main input\n");
+        return AVERROR(EINVAL);
+    }
+
+    ctx->in_format_main = frames_ctx->sw_format;
+    if (!format_is_supported(supported_main_formats, ctx->in_format_main)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported main input format: %s\n",
+               av_get_pix_fmt_name(ctx->in_format_main));
+        return AVERROR(ENOSYS);
+    }
+
+    // check tonemap input formats
+
+    if (!frames_ctx_tonemap) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on tonemap input\n");
+        return AVERROR(EINVAL);
+    }
+
+    ctx->in_format_tonemap = frames_ctx_tonemap->sw_format;
+    if (!format_is_supported(supported_tonemap_formats, ctx->in_format_tonemap)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported tonemap input format: %s\n",
+               av_get_pix_fmt_name(ctx->in_format_tonemap));
+        return AVERROR(ENOSYS);
+    }
+
+    // check we can tonemap pictures with those pixel formats
+
+    if (!formats_match(ctx->in_format_main, ctx->in_format_tonemap)) {
+        av_log(ctx, AV_LOG_ERROR, "Can't tonemap %s on %s\n",
+               av_get_pix_fmt_name(ctx->in_format_tonemap), av_get_pix_fmt_name(ctx->in_format_main));
+        return AVERROR(EINVAL);
+    }
+
+    // initialize
+
+    ctx->hwctx = frames_ctx->device_ctx->hwctx;
+    cuda_ctx = ctx->hwctx->cuda_ctx;
+    ctx->fs.time_base = inlink->time_base;
+
+    ctx->cu_stream = ctx->hwctx->stream;
+
+    outlink->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
+
+    // load functions
+
+    cu = ctx->hwctx->internal->cuda_dl;
+
+    err = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (err < 0) {
+        return err;
+    }
+
+    err = CHECK_CU(cu->cuModuleLoadData(&ctx->cu_module, vf_tonemap_cuda_ptx));
+    if (err < 0) {
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+        return err;
+    }
+
+    err = CHECK_CU(cu->cuModuleGetFunction(&ctx->cu_func, ctx->cu_module, "tonemap_Cuda"));
+    if (err < 0) {
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+        return err;
+    }
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    // init dual input
+
+    err = ff_framesync_init_dualinput(&ctx->fs, avctx);
+    if (err < 0) {
+        return err;
+    }
+
+    return ff_framesync_configure(&ctx->fs);
+}
+
+//filter_frame() goes here - a hedged sketch of the missing frame-event handler is appended at the end of this file
+
+
+
+/*
+NOTICE: this is a test build based on the initial works of the NVIDIA Corporation to create an FFmpeg
+tonemapping filter.
+This filter will take in a source file that is presumed to be HDR (probably P010)
+and convert it to an approximation of the source content within the SDR/Rec.709 colour space.
+
+Initially this will be done with the Hable filter, as it is easier to implement and relatively simple.
+
+Over time I hope to use the BT.2390-8 EOTF, but that is beyond the scope of the initial build
+*/
+
+
+#include "libavutil/log.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+
+#include "avfilter.h"
+#include "framesync.h"
+#include "internal.h"
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, ctx->hwctx->internal->cuda_dl, x)
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+
+#define BLOCK_X 32
+#define BLOCK_Y 16
+
+static const enum AVPixelFormat supported_main_formats[] = {
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE,
+};
+
+static const enum AVPixelFormat supported_tonemap_formats[] = {
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUVA420P,
+    AV_PIX_FMT_NONE,
+};
+
+/**
+ * tonemapCUDAContext
+ */
+typedef struct tonemapCUDAContext {
+    const AVClass *class;
+
+    enum AVPixelFormat in_format_tonemap;
+    enum AVPixelFormat in_format_main;
+
+    AVCUDADeviceContext *hwctx;
+
+    CUcontext cu_ctx;
+    CUmodule cu_module;
+    CUfunction cu_func;
+    CUstream cu_stream;
+
+    FFFrameSync fs;
+
+    // leftover from the overlay_cuda scaffolding this file is derived from; unused so far
+    int x_position;
+    int y_position;
+
+} tonemapCUDAContext;
+
+/**
+ * Helper to find out if provided format is supported by filter
+ */
+static int format_is_supported(const enum AVPixelFormat formats[], enum AVPixelFormat fmt)
+{
+    for (int i = 0; formats[i] != AV_PIX_FMT_NONE; i++)
+        if (formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+/**
+ * Helper checks if we can process main and tonemap pixel formats
+ */
+static int formats_match(const enum AVPixelFormat format_main, const enum AVPixelFormat format_tonemap) {
+    switch(format_main) {
+    case AV_PIX_FMT_NV12:
+        return format_tonemap == AV_PIX_FMT_NV12;
+    case AV_PIX_FMT_YUV420P:
+        return format_tonemap == AV_PIX_FMT_YUV420P ||
+               format_tonemap == AV_PIX_FMT_YUVA420P;
+    default:
+        return 0;
+    }
+}
+
+
+//Standard FFmpeg options for documentation
+
+
+static const AVFilterPad tonemap_cuda_inputs[] = {
+    {
+        .name = "main",
+        .type = AVMEDIA_TYPE_VIDEO,
+    },
+    // NB: tonemap_cuda_config_output also reads avctx->inputs[1]; the second (dual-input) pad still needs to be added
+    { NULL }
+};
+
+static const AVFilterPad tonemap_cuda_outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+        .config_props  = &tonemap_cuda_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_tonemap_cuda = {
+    .name            = "tonemap_cuda",
+    .description     = NULL_IF_CONFIG_SMALL("tonemap video using CUDA"),
+    .priv_size       = sizeof(tonemapCUDAContext),
+    .priv_class      = &tonemap_cuda_class,
+    .init            = &tonemap_cuda_init,
+    .uninit          = &tonemap_cuda_uninit,
+    .activate        = &tonemap_cuda_activate,
+    .query_formats   = &tonemap_cuda_query_formats,
+    .inputs          = tonemap_cuda_inputs,
+    .outputs         = tonemap_cuda_outputs,
+    .preinit         = tonemap_cuda_framesync_preinit,
+    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
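+
+/*
+Hedged sketch of the missing filter_frame step (the "on_event" handler that
+tonemap_cuda_init wires up). It is NOT part of the build yet, which is why it
+stays inside this comment. It follows the shape of the overlay_cuda blend
+callback, using only helpers already referenced in this file (framesync,
+CHECK_CU, DIV_UP, BLOCK_X/BLOCK_Y). The kernel argument list is a placeholder
+(luma plane only: data pointer, pitch, width, height) until the CUDA side of
+"tonemap_Cuda" settles down; chroma would need its own half-size launch.
+
+static int tonemap_cuda_blend(FFFrameSync *fs)
+{
+    AVFilterContext *avctx = fs->parent;
+    tonemapCUDAContext *ctx = avctx->priv;
+    AVFilterLink *outlink = avctx->outputs[0];
+    CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
+    CUcontext dummy;
+    AVFrame *main_frame = NULL;
+    int err;
+
+    // grab the (writable) main frame from the framesync machinery
+    err = ff_framesync_get_frame(fs, 0, &main_frame, 1);
+    if (err < 0)
+        return err;
+
+    err = CHECK_CU(cu->cuCtxPushCurrent(ctx->hwctx->cuda_ctx));
+    if (err < 0)
+        return err;
+
+    {
+        // launch one thread per luma pixel, tonemapping in place
+        unsigned char *luma = main_frame->data[0];
+        int pitch  = main_frame->linesize[0];
+        int width  = main_frame->width;
+        int height = main_frame->height;
+        void *args[] = { &luma, &pitch, &width, &height };
+
+        err = CHECK_CU(cu->cuLaunchKernel(ctx->cu_func,
+                                          DIV_UP(width, BLOCK_X), DIV_UP(height, BLOCK_Y), 1,
+                                          BLOCK_X, BLOCK_Y, 1,
+                                          0, ctx->cu_stream, args, NULL));
+    }
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    if (err < 0)
+        return err;
+
+    // pass the tonemapped frame down the chain
+    return ff_filter_frame(outlink, main_frame);
+}
+*/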