# mimo-1.0/tools/util.py
import os

import cv2
import imageio
import numpy as np
import torch
from PIL import Image

def all_file(file_dir):
    # recursively collect image/video files under file_dir
    exts = ('.png', '.jpg', '.jpeg', '.JPG', '.mp4')
    paths = []
    for root, _, files in os.walk(file_dir):
        for file in files:
            if os.path.splitext(file)[1] in exts:
                paths.append(os.path.join(root, file))
    return paths

def crop_img(img, mask):
    # find the bounding box of the mask
    x, y, w, h = cv2.boundingRect(mask)
    y_max = y + h
    x_max = x + w
    # extend the box vertically by 5% of its height on each side
    y = max(0, y - int(h * 0.05))
    y_max = min(img.shape[0], y_max + int(h * 0.05))
    return img[y:y_max, x:x_max]

def pad_img(img, color=(255, 255, 255)):
    # pad to a square whose side length is a multiple of 16
    h, w = img.shape[:2]
    max_size = max(h, w)
    if max_size % 16 != 0:
        max_size = (max_size // 16 + 1) * 16
    top = (max_size - h) // 2
    bottom = max_size - h - top
    left = (max_size - w) // 2
    right = max_size - w - left
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    padding_v = [top, bottom, left, right]
    return img, padding_v

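# A minimal usage sketch (not part of the original module): pad_img should
# return a square canvas whose side is a multiple of 16, plus the applied
# [top, bottom, left, right] padding. Run manually to check the invariants.
def _demo_pad_img():
    img = np.zeros((30, 50, 3), dtype=np.uint8)  # dummy 30x50 image
    padded, padding_v = pad_img(img)
    assert padded.shape[0] == padded.shape[1] and padded.shape[0] % 16 == 0
    print(padded.shape, padding_v)  # expected: (64, 64, 3) [17, 17, 7, 7]
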
def extract_mask_sdc(img):
    # treat any sufficiently bright pixel as human
    mask = np.zeros_like(img[:, :, 0])
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # threshold at 10 rather than 0 to suppress low-level noise
    mask[gray > 10] = 255
    return mask

def clean_mask(mask):
    # close small holes, then open to remove speckles
    se1 = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    se2 = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, se1)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, se2)
    return mask

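# A minimal sketch (synthetic frame, not project data) of the mask pipeline:
# extract_mask_sdc thresholds the frame, clean_mask removes speckle noise
# with morphological close/open.
def _demo_mask_pipeline():
    frame = np.zeros((64, 64, 3), dtype=np.uint8)
    frame[16:48, 24:40] = 200          # bright "human" region
    frame[2, 2] = 60                   # isolated noise pixel
    mask = clean_mask(extract_mask_sdc(frame))
    print(cv2.boundingRect(mask))      # roughly (24, 16, 16, 32)
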
def crop_img_sdc(img, mask):
    # find the bounding box of the mask
    x, y, w, h = cv2.boundingRect(mask)
    y_max = y + h
    x_max = x + w
    # extend the box by 10% of the height vertically and 5% of the width
    # horizontally, clamped to the image bounds
    pad_h = 0.1
    pad_w = 0.05
    y = max(0, y - int(h * pad_h))
    y_max = min(img.shape[0], y_max + int(h * pad_h))
    x = max(0, x - int(w * pad_w))
    x_max = min(img.shape[1], x_max + int(w * pad_w))
    return y, y_max, x, x_max

def crop_human(pose_images, vid_images, mask_images):
    # find one bbox that covers the human across all frames
    y = 10000
    y_max = 0
    x = 10000
    x_max = 0
    for pose_img in pose_images:
        frame = np.array(pose_img)
        mask = extract_mask_sdc(frame)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
    # ensure width and height are divisible by 2
    if (y_max - y) % 2 == 1:
        y_max += 1
    if (x_max - x) % 2 == 1:
        x_max += 1
    # crop the human region out of every frame
    frames_res = []
    vid_res = []
    mask_res = []
    for i, pose_img in enumerate(pose_images):
        frame = np.array(pose_img)
        frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
        vid = np.array(vid_images[i])
        vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
        mask = np.array(mask_images[i])
        mask_res.append(Image.fromarray(mask[y:y_max, x:x_max]))
    return frames_res, vid_res, mask_res

def init_bbox():
    # sentinel [x, x_max, y, y_max] that any real bbox will tighten
    return [10000, 0, 10000, 0]

def bbox_div2(x, x_max, y, y_max):
    # ensure width and height are divisible by 2
    if (y_max - y) % 2 == 1:
        y_max += 1
    if (x_max - x) % 2 == 1:
        x_max += 1
    return x, x_max, y, y_max

def bbox_pad(x, x_max, y, y_max, img):
    # grow the bbox toward a square whose side is a multiple of 16,
    # clamped to the image bounds
    w = x_max - x
    h = y_max - y
    max_size = max(h, w)
    if max_size % 16 != 0:
        max_size = (max_size // 16 + 1) * 16
    top = (max_size - h) // 2
    bottom = max_size - h - top
    left = (max_size - w) // 2
    right = max_size - w - left
    y = max(0, y - top)
    y_max = min(img.shape[0], y_max + bottom)
    x = max(0, x - left)
    x_max = min(img.shape[1], x_max + right)
    return x, x_max, y, y_max

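# A minimal sketch (toy numbers) of the bbox helpers: bbox_div2 makes the
# box even-sized, and bbox_pad grows it toward a 16-divisible square within
# the image.
def _demo_bbox_helpers():
    img = np.zeros((200, 200, 3), dtype=np.uint8)
    x, x_max, y, y_max = bbox_div2(30, 51, 20, 60)   # width 21 -> 22
    x, x_max, y, y_max = bbox_pad(x, x_max, y, y_max, img)
    print(x, x_max, y, y_max)  # 17 65 16 64, i.e. a 48x48 square
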
def compute_area_ratio(bbox_frame, bbox_clip):
    # area of the per-frame bbox relative to the clip-level bbox
    x1, x2, y1, y2 = bbox_frame
    x1_clip, x2_clip, y1_clip, y2_clip = bbox_clip
    area_frame = (x2 - x1) * (y2 - y1)
    area_clip = (x2_clip - x1_clip) * (y2_clip - y1_clip)
    return area_frame / area_clip

def update_clip(bbox_clip, start_idx, i, bbox_max):
    # overwrite the per-frame bboxes of a finished clip with the clip-level bbox
    x, x_max, y, y_max = bbox_max
    for j in range(start_idx, i):
        bbox_clip[j] = [x, x_max, y, y_max]

def crop_human_clip_auto_context(pose_images, vid_images, bk_images, overlay=4):
    # split the video into clips whose shared bbox still fits every frame
    # reasonably well; `overlay` is the number of frames two consecutive
    # clips overlap as temporal context
    bbox_clip = []
    bbox_perframe = []
    ratio_list = []
    x, x_max, y, y_max = init_bbox()
    n_frame = len(pose_images)
    context_list = []
    bbox_clip_list = []
    areas = np.zeros(n_frame)
    start_idx = 0
    for i in range(0, n_frame):
        frame = np.array(pose_images[i])
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)
        bbox_max_prev = (x, x_max, y, y_max)
        # update the running union bbox of the current clip
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
        bbox_max_cur = (x, x_max, y, y_max)
        # save the per-frame bbox
        bbox_cur = [x_, x_max_, y_, y_max_]
        bbox_perframe.append(bbox_cur)
        bbox_clip.append(bbox_cur)
        # area of each frame's bbox vs the union bbox so far
        areas[i] = (x_max_ - x_) * (y_max_ - y_) / 100
        area_max = (y_max - y) * (x_max - x) / 100
        if area_max != 0:
            ratios = areas[start_idx:i] / area_max
        else:
            ratios = np.zeros(i - start_idx)
        ROI_THE = 0.5
        if i == n_frame - 1:
            # last frame: close the current clip, including frame i
            i += 1
            if len(context_list) == 0:
                context_list.append(list(range(start_idx, i)))
            else:
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx - overlay_, i)))
            bbox_clip_list.append(bbox_max_cur)
            update_clip(bbox_clip, start_idx, i, bbox_max_cur)
            start_idx = i
            continue
        elif np.any(ratios < ROI_THE) and ratios.sum() != 0:
            # some frame became too small relative to the union bbox:
            # close the clip at frame i-1 and start a new one at frame i
            if len(context_list) == 0:
                context_list.append(list(range(start_idx, i)))
            else:
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx - overlay_, i)))
            bbox_clip_list.append(bbox_max_prev)
            update_clip(bbox_clip, start_idx, i, bbox_max_prev)
            x, x_max, y, y_max = bbox_cur
            start_idx = i
            continue
    # per-frame area ratios (kept for inspection/visualization)
    for i in range(0, n_frame):
        bbox_frame_ = bbox_perframe[i]
        bbox_clip_ = bbox_clip[i]
        if np.array(bbox_clip_).sum() == 0:
            ratio = 0
        else:
            ratio = compute_area_ratio(bbox_frame_, bbox_clip_)
        ratio_list.append(ratio)
    # crop the pose, video, and background frames of each clip
    frames_res = []
    vid_res = []
    bk_res = []
    for k, context in enumerate(context_list):
        for i in context:
            frame = np.array(pose_images[i])
            x, x_max, y, y_max = bbox_clip_list[k]
            if x >= x_max or y >= y_max:
                # degenerate bbox: fall back to (almost) the full frame
                x, x_max, y, y_max = 0, frame.shape[1] - 1, 0, frame.shape[0] - 1
            frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
            vid = np.array(vid_images[i])
            vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
            bk = np.array(bk_images[i])
            bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))
    return frames_res, vid_res, bk_res, bbox_clip, context_list, bbox_clip_list

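# A minimal sketch (synthetic frames, not project data) of the auto-context
# splitter: a bright block drifting across a black background stands in for
# the rendered pose, and the same frames are reused as video and background
# inputs.
def _demo_crop_human_clip_auto_context():
    frames = []
    for t in range(8):
        f = np.zeros((128, 128, 3), dtype=np.uint8)
        f[32:96, 32 + 4 * t:64 + 4 * t] = 255  # "human" drifting right
        frames.append(Image.fromarray(f))
    out = crop_human_clip_auto_context(frames, frames, frames, overlay=2)
    frames_res, vid_res, bk_res, bbox_clip, context_list, bbox_clip_list = out
    print(len(context_list), bbox_clip_list)  # slow drift -> a single clip
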
def crop_human_clip(pose_images, vid_images, bk_images, clip_length=1):
    # find one bbox per clip of `clip_length` frames
    bbox_clip = []
    x, x_max, y, y_max = init_bbox()
    n_frame = len(pose_images)
    for i in range(0, n_frame):
        frame = np.array(pose_images[i])
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)
        # update the running union bbox of the current clip
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
        if ((i + 1) % clip_length == 0) or (i == n_frame - 1):
            x, x_max, y, y_max = bbox_div2(x, x_max, y, y_max)
            if x >= x_max or y >= y_max:
                # degenerate bbox: fall back to (almost) the full frame
                x, x_max, y, y_max = 0, frame.shape[1] - 1, 0, frame.shape[0] - 1
            bbox_clip.append([x, x_max, y, y_max])
            x, x_max, y, y_max = init_bbox()
    # crop the pose, video, and background frames with their clip bbox
    frames_res = []
    vid_res = []
    bk_res = []
    for i, pose_img in enumerate(pose_images):
        x, x_max, y, y_max = bbox_clip[i // clip_length]
        frame = np.array(pose_img)
        frames_res.append(Image.fromarray(frame[y:y_max, x:x_max]))
        vid = np.array(vid_images[i])
        vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))
        bk = np.array(bk_images[i])
        bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))
    return frames_res, vid_res, bk_res, bbox_clip

def init_bk(n_frame, h, w):
    # n_frame blank (white) background images
    return [Image.fromarray(np.ones((h, w, 3), dtype=np.uint8) * 255)
            for _ in range(n_frame)]

def pose_adjust(pose_image, width=512, height=784):
    # resize the pose image to the target height, then pad or center-crop
    # horizontally to the target width
    canvas = np.zeros((height, width, 3), dtype=np.uint8)
    pose_img = np.array(pose_image)
    h, w, c = pose_img.shape
    nh, nw = height, int(w * height / h)
    pose_img = cv2.resize(pose_img, (nw, nh), interpolation=cv2.INTER_AREA)
    if nw < width:
        # pad left/right
        pad = (width - nw) // 2
        canvas[:, pad:pad + nw, :] = pose_img
    else:
        # center crop
        crop = (nw - width) // 2
        canvas = pose_img[:, crop:crop + width, :]
    return Image.fromarray(canvas)

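# A minimal sketch (dummy input, not project data): a square pose image
# adjusted to the default 512x784 canvas gets height-fitted and then
# center-cropped to the target width.
def _demo_pose_adjust():
    pose = Image.fromarray(np.full((100, 100, 3), 128, dtype=np.uint8))
    out = pose_adjust(pose)
    print(out.size)  # (512, 784), i.e. width x height in PIL terms
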
def load_pretrain_pose_guider(model, ckpt_path):
    state_dict = torch.load(ckpt_path, map_location="cpu")
    # expand conv_in from c to 3c input channels by appending zero weights,
    # so the extra condition channels start with no influence
    weights = state_dict['conv_in.weight']
    weights = torch.cat((weights, torch.zeros_like(weights), torch.zeros_like(weights)), dim=1)
    state_dict['conv_in.weight'] = weights
    model.load_state_dict(state_dict, strict=True)
    return model

def refine_img_prepross(image, mask):
    # stack the image with its mask as a 4th channel
    im_ary = np.asarray(image).astype(np.float32)
    return np.concatenate([im_ary, mask[:, :, np.newaxis]], axis=-1)

# index of the precomputed border mask to use, keyed by which image borders
# the bbox touches
mask_mode = {'up_down_left_right': 0, 'left_right_up': 1, 'left_right_down': 2, 'up_down_left': 3, 'up_down_right': 4,
             'left_right': 5, 'up_down': 6, 'left_up': 7, 'right_up': 8, 'left_down': 9, 'right_down': 10,
             'left': 11, 'right': 12, 'up': 13, 'down': 14, 'inner': 15}

def get_mask(mask_list, bbox, img):
    # pick the border mask matching which image borders the bbox touches
    w, h = img.size
    w_min, w_max, h_min, h_max = bbox
    if w_min <= 0 and w_max >= w and h_min <= 0 and h_max >= h:
        mode = 'up_down_left_right'
    elif w_min <= 0 and w_max >= w and h_min <= 0:
        mode = 'left_right_up'
    elif w_min <= 0 and w_max >= w and h_max >= h:
        mode = 'left_right_down'
    elif w_min <= 0 and h_min <= 0 and h_max >= h:
        mode = 'up_down_left'
    elif w_max >= w and h_min <= 0 and h_max >= h:
        mode = 'up_down_right'
    elif w_min <= 0 and w_max >= w:
        mode = 'left_right'
    elif h_min <= 0 and h_max >= h:
        mode = 'up_down'
    elif w_min <= 0 and h_min <= 0:
        mode = 'left_up'
    elif w_max >= w and h_min <= 0:
        mode = 'right_up'
    elif w_min <= 0 and h_max >= h:
        mode = 'left_down'
    elif w_max >= w and h_max >= h:
        mode = 'right_down'
    elif w_min <= 0:
        mode = 'left'
    elif w_max >= w:
        mode = 'right'
    elif h_min <= 0:
        mode = 'up'
    elif h_max >= h:
        mode = 'down'
    else:
        mode = 'inner'
    return mask_list[mask_mode[mode]]

def load_mask_list(mask_path):
    # load one grayscale mask per mode; files are named <mask_path>_<mode>.png
    mask_list = []
    for key in mask_mode.keys():
        mask = cv2.imread(mask_path[:-4] + '_%s.png' % key)
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
        mask_list.append(mask)
    return mask_list

def recover_bk(images, start_idx, end_idx, template_name=None):
    # blank out frames in [start_idx, end_idx) with white
    img = np.array(images[0])
    for i in range(start_idx, end_idx):
        if template_name == "dance_indoor_1":
            # in-place whitening; assumes images[i] is an indexable array
            images[i][:img.shape[0], :, 0] = 255
            images[i][:img.shape[0], :, 1] = 255
            images[i][:img.shape[0], :, 2] = 255
        else:
            images[i] = Image.fromarray(np.ones_like(img) * 255)
    return images

def load_video_fixed_fps(vid_path, target_fps=30, target_speed=1):
    # load the video and resample its frames to target_fps
    reader = imageio.get_reader(vid_path)
    fps = round(reader.get_meta_data()['fps'])
    # the ratio of original fps to target fps decides which frames to keep
    keep_ratio = target_speed * fps / target_fps
    n_frames = reader.count_frames()
    keep_frames_indices = np.arange(0, n_frames, keep_ratio).astype(int)
    # extract frames at the target frame rate
    frames = [Image.fromarray(reader.get_data(i)) for i in keep_frames_indices if i < n_frames]
    reader.close()
    return frames

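
# A minimal usage sketch; the video path is hypothetical, not part of the
# original module.
if __name__ == '__main__':
    frames = load_video_fixed_fps('example.mp4', target_fps=30)
    print('loaded %d frames' % len(frames))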