# Spaces: minhho / mimo-1.0 (Paused)
# File size: 15,154 Bytes
# 6f2c7f0

import numpy as np
import cv2
import glob
import imageio
from PIL import Image
import os

def all_file(file_dir):
    """Recursively collect image and video file paths below ``file_dir``.

    A file is kept when its extension is exactly one of ``.png``, ``.jpg``,
    ``.jpeg``, ``.JPG`` or ``.mp4`` (the comparison is case-sensitive, so
    e.g. ``.PNG`` is skipped).

    Returns:
        A list of paths (walked directory joined with the file name), in the
        order ``os.walk`` produces them.
    """
    wanted = ('.png', '.jpg', '.jpeg', '.JPG', '.mp4')
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
        if os.path.splitext(name)[1] in wanted
    ]

def crop_img(img, mask):
    """Crop ``img`` to the bounding rectangle of ``mask`` plus a vertical margin.

    The rectangle comes from ``cv2.boundingRect(mask)``. Only the top and
    bottom edges are pushed outwards, each by 5% of the rectangle height
    (so roughly 10% in total), clamped to the image rows. The left and right
    edges of the rectangle are used unchanged.

    Returns:
        The cropped slice ``img[rows, cols]``.
    """
    left, top, box_w, box_h = cv2.boundingRect(mask)
    margin = int(box_h * 0.05)
    row_start = max(0, top - margin)
    row_stop = min(img.shape[0], top + box_h + margin)
    return img[row_start:row_stop, left:left + box_w]

def pad_img(img, color=(255, 255, 255)):
    """Pad ``img`` with a constant colour so it becomes a centred square.

    The side of the square is the larger of the image height and width,
    rounded up to the next multiple of 16 when it is not one already. The
    extra pixels are split between opposite sides; when the amount is odd
    the bottom/right side receives the extra pixel.

    Args:
        img: Array whose first two dimensions are (height, width).
        color: Border colour passed to ``cv2.copyMakeBorder`` as ``value``.
            The default is an immutable tuple (white when all three channels
            are 255) so the default object can never be mutated across calls.

    Returns:
        ``(padded, [top, bottom, left, right])`` - the padded image and the
        number of pixels added on each side.
    """
    height, width = img.shape[:2]
    side = max(height, width)
    if side % 16 != 0:
        # round up to the next multiple of 16
        side = int(side / 16) * 16 + 16
    pad_top = (side - height) // 2
    pad_bottom = side - height - pad_top
    pad_left = (side - width) // 2
    pad_right = side - width - pad_left
    padded = cv2.copyMakeBorder(
        img, pad_top, pad_bottom, pad_left, pad_right,
        cv2.BORDER_CONSTANT, value=color,
    )
    return padded, [pad_top, pad_bottom, pad_left, pad_right]

def extract_mask_sdc(img):
    """Build a binary foreground (human) mask from a colour image.

    The image is converted to grayscale with ``cv2.COLOR_RGB2GRAY``; every
    pixel whose gray value is above 10 is set to 255 in the mask and all
    other pixels stay 0. The threshold is 10 rather than 0 so that faint
    noise is not treated as foreground.

    Returns:
        A 2-D array shaped like one channel of ``img`` and with its dtype.
    """
    fg_mask = np.zeros_like(img[:, :, 0])
    luminance = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    is_foreground = luminance > 10
    fg_mask[is_foreground] = 255
    return fg_mask

def clean_mask(mask):
    """Clean a mask with a morphological close followed by an open.

    The close uses a 5x5 rectangular kernel and the open a 2x2 rectangular
    kernel, applied in that order.

    Returns:
        The filtered mask produced by ``cv2.morphologyEx``.
    """
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    closed = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, close_kernel)
    return cv2.morphologyEx(closed, cv2.MORPH_OPEN, open_kernel)

def crop_img_sdc(img, mask):
    """Return the margin-extended bounding box of ``mask``.

    The rectangle from ``cv2.boundingRect(mask)`` is widened by 10% of its
    height above and below and by 5% of its width on the left and right,
    then clamped to the bounds of ``img``.

    Returns:
        A tuple ``(y, y_max, x, x_max)`` - note the row range comes first.
    """
    left, top, box_w, box_h = cv2.boundingRect(mask)
    margin_y = int(box_h * 0.1)
    margin_x = int(box_w * 0.05)
    img_h, img_w = img.shape[0], img.shape[1]
    return (
        max(0, top - margin_y),
        min(img_h, top + box_h + margin_y),
        max(0, left - margin_x),
        min(img_w, left + box_w + margin_x),
    )

def crop_human(pose_images, vid_images, mask_images):
    """Crop every frame to one box that covers the subject in all pose frames.

    For each pose frame a mask is extracted (``extract_mask_sdc``) and its
    padded bounding box computed (``crop_img_sdc``); the union of these boxes
    over all frames is then grown by one pixel on the bottom/right where its
    height/width is odd. The same box is used to slice the pose, video and
    mask image at each index.

    Returns:
        ``(frames_res, vid_res, mask_res)`` - three lists of PIL images
        holding the cropped pose, video and mask frames.
    """
    frame_count = len(pose_images)

    # Union of the per-frame boxes; starts "inside out" so the first frame wins.
    top, bottom = 10000, 0
    left, right = 10000, 0
    for pose_img in pose_images:
        pose_arr = np.array(pose_img)
        f_top, f_bottom, f_left, f_right = crop_img_sdc(pose_arr, extract_mask_sdc(pose_arr))
        top, bottom = min(top, f_top), max(bottom, f_bottom)
        left, right = min(left, f_left), max(right, f_right)

    # Make both extents even by growing the far edge.
    if (bottom - top) % 2 == 1:
        bottom += 1
    if (right - left) % 2 == 1:
        right += 1

    def _crop(image):
        # Slice one image with the shared box and wrap it back into PIL.
        return Image.fromarray(np.array(image)[top:bottom, left:right])

    frames_res, vid_res, mask_res = [], [], []
    for idx in range(frame_count):
        frames_res.append(_crop(pose_images[idx]))
        vid_res.append(_crop(vid_images[idx]))
        mask_res.append(_crop(mask_images[idx]))
    return frames_res, vid_res, mask_res


def init_bbox():
    """Return the starting box ``[x, x_max, y, y_max]`` for a running union.

    The minimum slots start at 10000 and the maximum slots at 0, so the first
    ``min``/``max`` update with a real box replaces every entry.
    """
    start_min, start_max = 10000, 0
    return [start_min, start_max, start_min, start_max]

def bbox_div2(x, x_max, y, y_max):
    """Grow a box so that its width and height are both even.

    When ``x_max - x`` (or ``y_max - y``) is odd, the corresponding maximum
    is increased by one; the minimum coordinates are never changed.

    Returns:
        The tuple ``(x, x_max, y, y_max)`` after adjustment.
    """
    odd_width = (x_max - x) % 2 == 1
    odd_height = (y_max - y) % 2 == 1
    new_x_max = (x_max + 1) if odd_width else x_max
    new_y_max = (y_max + 1) if odd_height else y_max
    return x, new_x_max, y, new_y_max

def bbox_pad(x, x_max, y, y_max, img):
    """Expand a box towards a centred square whose side is a multiple of 16.

    The target side is the larger box dimension, rounded up to the next
    multiple of 16 when needed. The missing pixels are split between opposite
    sides (the bottom/right side takes the extra one when odd), and the
    result is clamped to the bounds of ``img`` - so near an image border the
    returned box may end up smaller than the target square.

    Returns:
        The tuple ``(x, x_max, y, y_max)`` of the expanded, clamped box.
    """
    box_h = y_max - y
    box_w = x_max - x
    side = max(box_h, box_w)
    if side % 16 != 0:
        # round up to the next multiple of 16
        side = int(side / 16) * 16 + 16

    extra_h = side - box_h
    extra_w = side - box_w
    grow_top = extra_h // 2
    grow_left = extra_w // 2
    grow_bottom = extra_h - grow_top
    grow_right = extra_w - grow_left

    img_h, img_w = img.shape[0], img.shape[1]
    return (
        max(0, x - grow_left),
        min(img_w, x_max + grow_right),
        max(0, y - grow_top),
        min(img_h, y_max + grow_bottom),
    )

def compute_area_ratio(bbox_frame, bbox_clip):
    """Return the area of ``bbox_frame`` divided by the area of ``bbox_clip``.

    Both boxes are ``(x1, x2, y1, y2)`` sequences.

    Args:
        bbox_frame: Box of a single frame.
        bbox_clip: Box of the clip the frame belongs to.

    Returns:
        ``area_frame / area_clip``, or ``0.0`` when the clip box has zero
        area (zero width or zero height), so a degenerate clip box does not
        raise ``ZeroDivisionError``.
    """
    x1, x2, y1, y2 = bbox_frame
    x1_clip, x2_clip, y1_clip, y2_clip = bbox_clip
    area_frame = (x2 - x1) * (y2 - y1)
    area_clip = (x2_clip - x1_clip) * (y2_clip - y1_clip)
    if area_clip == 0:
        # Degenerate clip box: there is no meaningful ratio to report.
        return 0.0
    return area_frame / area_clip

def update_clip(bbox_clip, start_idx, i, bbox_max):
    """Overwrite ``bbox_clip[start_idx:i]`` in place with copies of ``bbox_max``.

    ``bbox_max`` must unpack into four values ``(x, x_max, y, y_max)``. Each
    overwritten slot receives its own new list. Nothing is returned.
    """
    left, right, top, bottom = bbox_max
    idx = start_idx
    while idx < i:
        bbox_clip[idx] = [left, right, top, bottom]
        idx += 1

def crop_human_clip_auto_context(pose_images, vid_images, bk_images, overlay=4):
    """Split the frames into clips by subject size and crop each clip to one box.

    Frames are scanned in order while a running union box is maintained. A
    clip is closed just before frame ``i`` when, measured against the union
    that includes frame ``i``, at least one earlier frame of the current clip
    covers less than ``ROI_THE`` (0.5) of that union's area. The last frame
    always closes the final clip. Every clip after the first also re-includes
    up to ``overlay`` trailing frame indices of the previous clip, so those
    frames are cropped again with the new clip's box and appear more than
    once in the returned image lists.

    Args:
        pose_images: Sequence of pose frames; each is converted with
            ``np.array`` and used to locate the subject.
        vid_images: Sequence of video frames, indexed like ``pose_images``.
        bk_images: Sequence of background frames, indexed like ``pose_images``.
        overlay: Maximum number of frame indices of the previous clip that
            are prepended to the next clip's index list.

    Returns:
        A 6-tuple ``(frames_res, vid_res, bk_res, bbox_clip, context_list,
        bbox_clip_list)``:
        the three lists of cropped PIL images (pose, video, background), one
        entry per index in ``context_list``; ``bbox_clip`` with one
        ``[x, x_max, y, y_max]`` box per input frame (the box of the clip the
        frame was assigned to); ``context_list`` with one list of frame
        indices per clip; and ``bbox_clip_list`` with one box per clip.
    """
    # find the bbox of the human in the clip frames
    bbox_clip = []
    bbox_perframe = []
    # NOTE: ratio_list is filled below but is neither returned nor read.
    ratio_list = []
    x, x_max, y, y_max = init_bbox()
    n_frame = len(pose_images)

    context_list = []
    bbox_clip_list = []

    areas = np.zeros(n_frame)
    start_idx = 0
    for i in range(0, n_frame):
        # print('i:', i)
        pose_img = pose_images[i]
        frame = np.array(pose_img)
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        # crop_img_sdc returns rows first; the other helpers take x first.
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)
        # Union of the current clip *before* this frame is merged in; used
        # when this frame turns out to start a new clip.
        bbox_max_prev = (x, x_max, y, y_max)

        # update max
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
        bbox_max_cur = (x, x_max, y, y_max)

        # save bbox per frame
        # The same list object is appended to both lists; entries of
        # bbox_clip are later replaced by update_clip.
        bbox_cur = [x_, x_max_, y_, y_max_]
        bbox_perframe.append(bbox_cur)
        bbox_clip.append(bbox_cur)

        # compute the area of each frame
        # Both areas are divided by 100, so the scaling cancels in the ratio.
        area = (x_max_ - x_) * (y_max_ - y_)/100
        areas[i] = area
        area_max = (y_max - y) * (x_max - x)/100
        # ratios covers the earlier frames of the current clip only
        # (start_idx .. i-1); the current frame is excluded by the slice.
        if area_max!=0:
            ratios = areas[start_idx:i]/area_max
        else:
            ratios = np.zeros(i-start_idx)

        # ROI_THE = 0.2
        ROI_THE = 0.5
        if (i == n_frame - 1):
            # Last frame: close the final clip including this frame. The
            # increment only makes range(start_idx, i) inclusive of it; it
            # does not affect the for-loop iteration.
            i += 1
            # print('update from ')
            # print('start_idx:', start_idx)
            # print('i:', i)

            # print('clip from to:', range(start_idx, i))
            if len(context_list)==0:
                context_list.append(list(range(start_idx, i)))
            else:
                # Prepend up to `overlay` indices from the previous clip.
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx-overlay_, i)))
            bbox_clip_list.append(bbox_max_cur)

            update_clip(bbox_clip, start_idx, i, bbox_max_cur)
            start_idx = i
            continue
        elif np.any(ratios < ROI_THE) and ratios.sum()!=0:
            # Some earlier frame is too small relative to the grown union:
            # close the clip *before* this frame with the previous union and
            # start a new clip at this frame.

            # generate a list from start_idx to i
            if len(context_list)==0:
                context_list.append(list(range(start_idx, i)))
            else:
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx-overlay_, i)))
            bbox_clip_list.append(bbox_max_prev)

            # print('update from ')
            # print('start_idx:', start_idx)
            # print('i:', i)
            update_clip(bbox_clip, start_idx, i, bbox_max_prev)
            # Restart the running union from this frame's own box.
            x, x_max, y, y_max = bbox_cur
            start_idx = i
            continue

    # vis ratio
    # Per-frame ratio of the frame box to its clip box (diagnostic only).
    for i in range(0, n_frame):
        # print('i:', i)
        bbox_frame_ = bbox_perframe[i]
        bbox_clip_ = bbox_clip[i]
        # print('bbox_frame_:', bbox_frame_)
        # print('bbox_clip_:', bbox_clip_)
        if np.array(bbox_clip_).sum()==0:
            ratio = 0
        else:
            ratio = compute_area_ratio(bbox_frame_, bbox_clip_)
        # print('ratio:', ratio)
        ratio_list.append(ratio)

    # crop images
    frames_res = []
    vid_res = []
    bk_res = []
    for k, context in enumerate(context_list):
        for i in context:
            pose_img = pose_images[i]
            frame = np.array(pose_img)
            x, x_max, y, y_max = bbox_clip_list[k]
            # Degenerate clip box: fall back to (almost) the whole frame;
            # the "- 1" leaves out the last row and column.
            if x >= x_max or y >= y_max:
                x, x_max, y, y_max = 0, frame.shape[1] - 1, 0, frame.shape[0] - 1
            frame = frame[y:y_max, x:x_max]
            frame = Image.fromarray(frame)
            frames_res.append(frame)

            vid = vid_images[i]
            vid = np.array(vid)
            vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))

            bk = bk_images[i]
            bk = np.array(bk)
            bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))

    return frames_res, vid_res, bk_res, bbox_clip, context_list, bbox_clip_list


def crop_human_clip(pose_images, vid_images, bk_images, clip_length=1):
    """Crop frames in fixed-length clips, one shared box per clip.

    Frames are grouped into consecutive runs of ``clip_length`` (the last run
    may be shorter). Within a run, each pose frame's subject box is computed
    (mask extraction, cleaning, margin, even size, square padding) and the
    union of those boxes - made even again - becomes the clip box. A
    degenerate union falls back to the frame extent minus its last row and
    column.

    Returns:
        ``(frames_res, vid_res, bk_res, bbox_clip)`` - the cropped pose,
        video and background frames as PIL images, and the list of clip
        boxes ``[x, x_max, y, y_max]`` (one per clip).
    """
    n_frame = len(pose_images)
    bbox_clip = []
    left, right, top, bottom = init_bbox()
    for idx in range(n_frame):
        pose_arr = np.array(pose_images[idx])
        fg_mask = clean_mask(extract_mask_sdc(pose_arr))
        f_top, f_bottom, f_left, f_right = crop_img_sdc(pose_arr, fg_mask)
        f_left, f_right, f_top, f_bottom = bbox_div2(f_left, f_right, f_top, f_bottom)
        f_left, f_right, f_top, f_bottom = bbox_pad(f_left, f_right, f_top, f_bottom, pose_arr)

        # Merge this frame's box into the running union of the current clip.
        top, bottom = min(top, f_top), max(bottom, f_bottom)
        left, right = min(left, f_left), max(right, f_right)

        clip_done = (idx + 1) % clip_length == 0 or idx == n_frame - 1
        if not clip_done:
            continue

        left, right, top, bottom = bbox_div2(left, right, top, bottom)
        if left >= right or top >= bottom:
            # Degenerate union: use the frame extent minus one row/column.
            left, right, top, bottom = 0, pose_arr.shape[1] - 1, 0, pose_arr.shape[0] - 1
        bbox_clip.append([left, right, top, bottom])
        left, right, top, bottom = init_bbox()

    # Crop every frame with the box of the clip it belongs to.
    frames_res, vid_res, bk_res = [], [], []
    for idx, pose_img in enumerate(pose_images):
        left, right, top, bottom = bbox_clip[idx // clip_length]
        rows, cols = slice(top, bottom), slice(left, right)
        frames_res.append(Image.fromarray(np.array(pose_img)[rows, cols]))
        vid_res.append(Image.fromarray(np.array(vid_images[idx])[rows, cols]))
        bk_res.append(Image.fromarray(np.array(bk_images[idx])[rows, cols]))
    return frames_res, vid_res, bk_res, bbox_clip


def init_bk(n_frame,h,w):
    """Create ``n_frame`` independent PIL images of size ``h`` x ``w``.

    Every image is built from its own ``(h, w, 3)`` uint8 array in which all
    values are 255.

    Returns:
        A list with ``n_frame`` PIL images (empty when ``n_frame`` is not
        positive).
    """
    return [
        Image.fromarray(np.ones((h, w, 3), dtype=np.uint8) * 255)
        for _ in range(n_frame)
    ]



def pose_adjust(pose_image, width=512, height=784):
    """Fit a pose image into a ``width`` x ``height`` frame.

    The image is resized to exactly ``height`` rows with its aspect ratio
    kept (``cv2.INTER_AREA``). If the resized image is narrower than
    ``width`` it is pasted, horizontally centred, onto a zero-filled uint8
    canvas; otherwise a horizontally centred strip ``width`` columns wide is
    cut out of it.

    Args:
        pose_image: Image convertible with ``np.array`` into a 3-dimensional
            array (height, width, channels).
        width: Output width in pixels.
        height: Output height in pixels.

    Returns:
        The adjusted frame as a PIL image.
    """
    canvas = np.zeros((height, width, 3), dtype=np.uint8)
    pose_arr = np.array(pose_image)
    src_h, src_w, _channels = pose_arr.shape

    # Scale to the target height; the width follows from the aspect ratio.
    scaled_w = int(src_w * height / src_h)
    scaled = cv2.resize(pose_arr, (scaled_w, height), interpolation=cv2.INTER_AREA)

    if scaled_w >= width:
        # Wide enough: keep the central `width` columns.
        offset = (scaled_w - width) // 2
        canvas = scaled[:, offset:offset + width, :]
    else:
        # Too narrow: centre it on the blank canvas.
        offset = (width - scaled_w) // 2
        canvas[:, offset:offset + scaled_w, :] = scaled

    return Image.fromarray(canvas)


def load_pretrain_pose_guider(model, ckpt_path):
    """Load a checkpoint into ``model`` after widening ``conv_in.weight``.

    The checkpoint's ``conv_in.weight`` tensor is concatenated along
    dimension 1 with two zero tensors of the same shape, tripling the size of
    that dimension (presumably the input-channel dimension - confirm against
    the model definition). The modified state dict is then loaded with
    ``strict=True``, so any key or shape mismatch raises.

    Args:
        model: Object exposing ``load_state_dict``.
        ckpt_path: Path of the checkpoint passed to ``torch.load``.

    Returns:
        The same ``model`` object, with the weights loaded.
    """
    # Imported locally: this module has no file-level ``import torch``, so
    # the bare name would otherwise raise NameError on the first call.
    import torch

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(ckpt_path, map_location="cpu")

    weights = state_dict['conv_in.weight']
    zero_block = torch.zeros_like(weights)
    # The original weights stay in the first slice; the added slices are zero.
    state_dict['conv_in.weight'] = torch.cat((weights, zero_block, zero_block), dim=1)

    model.load_state_dict(state_dict, strict=True)

    return model

def refine_img_prepross(image, mask):
    """Append ``mask`` to ``image`` as one extra trailing channel.

    ``image`` is converted to a float32 array; ``mask`` gains a new axis at
    position 2 and is concatenated to the image along the last axis.

    Returns:
        The combined array (for a 3-channel image and a 2-D mask, four
        channels with the mask last).
    """
    pixels = np.asarray(image).astype(np.float32)
    mask_channel = mask[:, :, None]
    return np.concatenate((pixels, mask_channel), axis=-1)

# Maps the name of an edge-contact pattern to its position in the list built
# by load_mask_list; the names are listed in index order (0..15).
mask_mode = {
    mode_name: mode_index
    for mode_index, mode_name in enumerate((
        'up_down_left_right', 'left_right_up', 'left_right_down', 'up_down_left', 'up_down_right',
        'left_right', 'up_down', 'left_up', 'right_up', 'left_down', 'right_down',
        'left', 'right', 'up', 'down', 'inner',
    ))
}

def get_mask(mask_list, bbox, img):
    """Pick the mask matching which image edges the box touches.

    ``bbox`` is ``(w_min, w_max, h_min, h_max)`` and ``img.size`` supplies
    ``(w, h)``. The box touches the left edge when ``w_min <= 0``, the right
    edge when ``w_max >= w``, the top when ``h_min <= 0`` and the bottom when
    ``h_max >= h``. The combination of touched edges selects a mode name,
    whose index in ``mask_mode`` selects the entry of ``mask_list``.

    Returns:
        ``mask_list[mask_mode[mode]]`` for the detected mode.
    """
    img_w, img_h = img.size
    w_min, w_max, h_min, h_max = bbox

    touches = (
        bool(w_min <= 0),      # left
        bool(w_max >= img_w),  # right
        bool(h_min <= 0),      # up
        bool(h_max >= img_h),  # down
    )

    # (left, right, up, down) -> mode name
    mode_by_contact = {
        (True, True, True, True): 'up_down_left_right',
        (True, True, True, False): 'left_right_up',
        (True, True, False, True): 'left_right_down',
        (True, False, True, True): 'up_down_left',
        (False, True, True, True): 'up_down_right',
        (True, True, False, False): 'left_right',
        (False, False, True, True): 'up_down',
        (True, False, True, False): 'left_up',
        (False, True, True, False): 'right_up',
        (True, False, False, True): 'left_down',
        (False, True, False, True): 'right_down',
        (True, False, False, False): 'left',
        (False, True, False, False): 'right',
        (False, False, True, False): 'up',
        (False, False, False, True): 'down',
        (False, False, False, False): 'inner',
    }
    mode = mode_by_contact[touches]

    return mask_list[mask_mode[mode]]

def load_mask_list(mask_path):
    """Load one grayscale mask per entry of ``mask_mode``.

    For each mode name (in ``mask_mode`` order) the file
    ``<mask_path without its last four characters>_<name>.png`` is read with
    ``cv2.imread``, converted from BGR to gray and scaled to float32 values
    in units of 1/255.

    Returns:
        The list of mask arrays, ordered like ``mask_mode``.
    """
    stem = mask_path[:-4]  # drops the last four characters (e.g. ".png")
    masks = []
    for mode_name in mask_mode:
        bgr = cv2.imread(stem + '_%s.png' % mode_name)
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        masks.append(gray.astype(np.float32) / 255.0)
    return masks

def recover_bk(images, start_idx, end_idx, template_name=None):
    """Blank out the frames ``images[start_idx:end_idx]`` with the value 255.

    ``images[0]`` is used as the size reference. For the template
    ``"dance_indoor_1"`` the existing entries are modified in place: channels
    0, 1 and 2 of the first ``reference height`` rows are set to 255 (this
    requires entries that support item assignment, e.g. arrays). For any
    other template each entry is replaced by a new PIL image built from an
    array shaped like the reference and filled with 255.

    Returns:
        The same ``images`` list, after modification.
    """
    reference = np.array(images[0])
    fill_in_place = template_name == "dance_indoor_1"
    for idx in range(start_idx, end_idx):
        if fill_in_place:
            for channel in range(3):
                images[idx][:reference.shape[0], :, channel] = 255
        else:
            images[idx] = Image.fromarray(np.ones_like(reference) * 255)
    return images


def load_video_fixed_fps(vid_path, target_fps=30, target_speed=1):
    """Read a video and resample its frames to a target frame rate.

    The source frame rate (rounded to an integer) is taken from the reader's
    metadata. Frames are then picked at a stride of
    ``target_speed * fps / target_fps`` source frames per output frame, e.g.
    a 60 fps source with ``target_fps=30`` and ``target_speed=1`` keeps every
    second frame.

    Args:
        vid_path: Path (or other source) accepted by ``imageio.get_reader``.
        target_fps: Desired output frame rate.
        target_speed: Playback speed factor; values above 1 skip more frames.

    Returns:
        A list of PIL images, one per selected frame.
    """
    reader = imageio.get_reader(vid_path)
    try:
        fps = round(reader.get_meta_data()['fps'])

        # Number of source frames to advance for each output frame.
        keep_ratio = target_speed * fps / target_fps
        n_frames = reader.count_frames()
        keep_frames_indices = np.arange(0, n_frames, keep_ratio).astype(int)

        # Extract frames at the target frame rate
        frames = [Image.fromarray(reader.get_data(i)) for i in keep_frames_indices if i < len(reader)]
    finally:
        # Always release the reader, even when metadata lookup or decoding fails.
        reader.close()
    return frames