|
|
import glob
import os

import cv2
import imageio
import numpy as np
import torch
from PIL import Image
|
|
|
|
|
def all_file(file_dir):
    """Recursively collect image/video file paths under *file_dir*.

    Matches .png, .jpg, .jpeg and .mp4 extensions case-insensitively
    (generalizes the original explicit list, which mixed lower- and
    upper-case variants and e.g. accepted '.JPG' but not '.PNG').

    Args:
        file_dir: root directory to walk.

    Returns:
        List of full paths to the matching files.
    """
    wanted = {'.png', '.jpg', '.jpeg', '.mp4'}
    found = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            if os.path.splitext(file)[1].lower() in wanted:
                found.append(os.path.join(root, file))
    return found
|
|
|
|
|
def crop_img(img, mask):
    """Crop *img* to the mask's bounding box, widened vertically by 5% of
    the box height on each side (clamped to the image); horizontal extent
    is the tight bounding box."""
    left, top, box_w, box_h = cv2.boundingRect(mask)
    right = left + box_w
    bottom = top + box_h
    margin = int(box_h * 0.05)
    top = max(0, top - margin)
    bottom = min(img.shape[0], bottom + margin)
    return img[top:bottom, left:right]
|
|
|
|
|
def pad_img(img, color=(255, 255, 255)):
    """Pad *img* to a square whose side is the smallest multiple of 16
    that is >= max(h, w).

    Fix: the default border color was a mutable list (shared-mutable-default
    pitfall); it is now an equivalent tuple — cv2 accepts either.

    Args:
        img: HxWxC (or HxW) numpy image.
        color: constant border color.

    Returns:
        (padded_img, [top, bottom, left, right]) where the list holds the
        per-side padding amounts that were applied.
    """
    h, w = img.shape[:2]
    side = max(h, w)
    if side % 16 != 0:
        # Round up to the next multiple of 16 (integer arithmetic instead of
        # the original float-division form, identical for non-negative sizes).
        side = side // 16 * 16 + 16
    top = (side - h) // 2
    bottom = side - h - top
    left = (side - w) // 2
    right = side - w - left
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    padding_v = [top, bottom, left, right]
    return img, padding_v
|
|
|
|
|
def extract_mask_sdc(img):
    """Return a foreground mask for an RGB image: 255 where the grayscale
    intensity exceeds 10, else 0. The mask dtype matches the image's channel
    dtype (same as zeroing a copy of one channel)."""
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    mask = np.zeros_like(img[:, :, 0])
    mask[gray > 10] = 255
    return mask
|
|
|
|
|
def clean_mask(mask):
    """Denoise a binary mask: morphological close with a 5x5 rectangle to
    fill small holes, then open with a 2x2 rectangle to remove specks."""
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, close_kernel)
    return cv2.morphologyEx(mask, cv2.MORPH_OPEN, open_kernel)
|
|
|
|
|
def crop_img_sdc(img, mask):
    """Compute crop coordinates around the mask's bounding box, expanded by
    10% of its height vertically and 5% of its width horizontally, clamped
    to the image bounds.

    Returns:
        (y, y_max, x, x_max) crop coordinates.
    """
    left, top, box_w, box_h = cv2.boundingRect(mask)
    v_margin = int(box_h * 0.1)
    h_margin = int(box_w * 0.05)
    y0 = max(0, top - v_margin)
    y1 = min(img.shape[0], top + box_h + v_margin)
    x0 = max(0, left - h_margin)
    x1 = min(img.shape[1], left + box_w + h_margin)
    return y0, y1, x0, x1
|
|
|
|
|
def crop_human(pose_images, vid_images, mask_images):
    """Crop all frames to one union bounding box covering the human in every
    pose frame.

    A per-frame box is extracted from each pose image (mask of bright pixels,
    padded bounding rect); the union over all frames is made even-sided and
    applied identically to the pose, video and mask frames.

    Fixes: removed dead locals (`bbox`, `n_frame`) and replaced the inline
    even-ing logic with the existing `bbox_div2` helper for consistency with
    the clip-based variants.

    Args:
        pose_images, vid_images, mask_images: equal-length sequences of
            PIL-image-like frames.

    Returns:
        (frames_res, vid_res, mask_res): lists of cropped PIL images.
    """
    # Sentinel union box: any real box shrinks x/y and grows x_max/y_max.
    x, x_max, y, y_max = init_bbox()
    for pose_img in pose_images:
        frame = np.array(pose_img)
        mask = extract_mask_sdc(frame)
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)

    # Make width and height even (shared helper, same math as before).
    x, x_max, y, y_max = bbox_div2(x, x_max, y, y_max)

    frames_res = []
    vid_res = []
    mask_res = []
    for i, pose_img in enumerate(pose_images):
        frame = np.array(pose_img)[y:y_max, x:x_max]
        frames_res.append(Image.fromarray(frame))

        vid = np.array(vid_images[i])
        vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))

        mask = np.array(mask_images[i])
        mask_res.append(Image.fromarray(mask[y:y_max, x:x_max]))
    return frames_res, vid_res, mask_res
|
|
|
|
|
|
|
|
def init_bbox():
    """Return the sentinel bbox [x, x_max, y, y_max] used to start a
    min/max union: mins start huge, maxes start at zero."""
    sentinel = 10000
    return [sentinel, 0, sentinel, 0]
|
|
|
|
|
def bbox_div2(x, x_max, y, y_max):
    """Grow the box by one pixel on the max side of any odd dimension so
    that both width and height are even.

    Returns:
        (x, x_max, y, y_max) with even width and height.
    """
    if (y_max - y) % 2:
        y_max += 1
    if (x_max - x) % 2:
        x_max += 1
    return x, x_max, y, y_max
|
|
|
|
|
def bbox_pad(x, x_max, y, y_max, img):
    """Expand a bbox toward a square whose side is the next multiple of 16,
    centering the extra margin, clamped to the image bounds (so the result
    may be smaller than the target square near image edges).

    Returns:
        (x, x_max, y, y_max) expanded coordinates.
    """
    w = x_max - x
    h = y_max - y

    side = max(h, w)
    if side % 16:
        # Round up to the next multiple of 16.
        side = (side // 16 + 1) * 16

    pad_top = (side - h) // 2
    pad_bottom = side - h - pad_top
    pad_left = (side - w) // 2
    pad_right = side - w - pad_left

    y = max(0, y - pad_top)
    y_max = min(img.shape[0], y_max + pad_bottom)
    x = max(0, x - pad_left)
    x_max = min(img.shape[1], x_max + pad_right)
    return x, x_max, y, y_max
|
|
|
|
|
def compute_area_ratio(bbox_frame, bbox_clip):
    """Return area(bbox_frame) / area(bbox_clip), boxes as [x1, x2, y1, y2].

    Caller must ensure bbox_clip has non-zero area (division is unguarded).
    """
    fx1, fx2, fy1, fy2 = bbox_frame
    cx1, cx2, cy1, cy2 = bbox_clip
    frame_area = (fx2 - fx1) * (fy2 - fy1)
    clip_area = (cx2 - cx1) * (cy2 - cy1)
    return frame_area / clip_area
|
|
|
|
|
def update_clip(bbox_clip, start_idx, i, bbox_max):
    """Overwrite bbox_clip entries in [start_idx, i) with the clip-wide box
    (in place); each entry gets its own fresh list."""
    x, x_max, y, y_max = bbox_max
    for j in range(start_idx, i):
        bbox_clip[j] = [x, x_max, y, y_max]
|
|
|
|
|
def crop_human_clip_auto_context(pose_images, vid_images, bk_images, overlay=4):
    """Crop frames in automatically detected clips, each clip sharing one bbox.

    Frames are scanned in order while a running union bbox is accumulated.
    A clip is closed either at the last frame, or when some frame's own box
    becomes much smaller (< 50% area) than the running union — i.e. the
    subject's on-screen size changed enough that one shared crop no longer
    fits well. Each new clip's index list is extended backwards by up to
    `overlay` frames of the previous clip for temporal context.

    Args:
        pose_images, vid_images, bk_images: equal-length frame sequences
            (PIL-image-like; converted via np.array).
        overlay: max number of context frames borrowed from the previous clip.

    Returns:
        (frames_res, vid_res, bk_res, bbox_clip, context_list, bbox_clip_list):
        cropped pose/video/background frames (flattened over all clips,
        context frames included), the per-frame clip bbox list, the per-clip
        frame-index lists, and the per-clip bboxes.
    """
    bbox_clip = []       # per-frame bbox, later overwritten with its clip's bbox
    bbox_perframe = []   # per-frame tight (padded) bbox
    ratio_list = []      # per-frame area ratio frame-bbox / clip-bbox (diagnostic; not returned)
    x, x_max, y, y_max = init_bbox()  # running union bbox for the current clip
    n_frame = len(pose_images)

    context_list = []    # per clip: list of frame indices (with overlap context)
    bbox_clip_list = []  # per clip: the bbox applied to all its frames

    areas = np.zeros(n_frame)  # per-frame bbox area (scaled by 1/100)
    start_idx = 0              # first frame index of the current clip
    for i in range(0, n_frame):
        pose_img = pose_images[i]
        frame = np.array(pose_img)
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        # Per-frame box: padded bounding rect, made even-sided, squared to /16.
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)
        # Union bbox BEFORE folding in this frame (used if this frame splits the clip).
        bbox_max_prev = (x, x_max, y, y_max)

        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)
        # Union bbox including this frame.
        bbox_max_cur = (x, x_max, y, y_max)

        bbox_cur = [x_, x_max_, y_, y_max_]
        bbox_perframe.append(bbox_cur)
        bbox_clip.append(bbox_cur)

        # Areas scaled by 1/100 to keep numbers small; the ratio is unaffected.
        area = (x_max_ - x_) * (y_max_ - y_)/100
        areas[i] = area
        area_max = (y_max - y) * (x_max - x)/100
        if area_max!=0:
            # Ratios of the clip's earlier frames (slice excludes frame i itself)
            # against the current union area.
            ratios = areas[start_idx:i]/area_max
        else:
            ratios = np.zeros(i-start_idx)

        ROI_THE = 0.5  # split threshold: a frame box < 50% of the union area triggers a new clip
        if (i == n_frame - 1):
            # Last frame: bump i so the ranges below include it, then close the clip.
            i += 1
            if len(context_list)==0:
                context_list.append(list(range(start_idx, i)))
            else:
                # Prepend up to `overlay` trailing frames of the previous clip as context.
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx-overlay_, i)))
            bbox_clip_list.append(bbox_max_cur)

            update_clip(bbox_clip, start_idx, i, bbox_max_cur)
            start_idx = i
            continue
        elif np.any(ratios < ROI_THE) and ratios.sum()!=0:
            # Some earlier frame became much smaller than the union: close the
            # clip at frame i-1 using the union WITHOUT frame i, and start a
            # new clip whose union is re-seeded from frame i's own box.
            if len(context_list)==0:
                context_list.append(list(range(start_idx, i)))
            else:
                overlay_ = min(overlay, len(context_list[-1]))
                context_list.append(list(range(start_idx-overlay_, i)))
            bbox_clip_list.append(bbox_max_prev)

            update_clip(bbox_clip, start_idx, i, bbox_max_prev)
            x, x_max, y, y_max = bbox_cur
            start_idx = i
            continue

    # Diagnostic pass: per-frame area ratio against its clip bbox (computed
    # but not returned; kept as in the original).
    for i in range(0, n_frame):
        bbox_frame_ = bbox_perframe[i]
        bbox_clip_ = bbox_clip[i]
        if np.array(bbox_clip_).sum()==0:
            # Degenerate all-zero clip bbox: avoid dividing by zero area.
            ratio = 0
        else:
            ratio = compute_area_ratio(bbox_frame_, bbox_clip_)
        ratio_list.append(ratio)

    # Crop every clip's frames (including its context frames) with the clip bbox.
    frames_res = []
    vid_res = []
    bk_res = []
    for k, context in enumerate(context_list):
        for i in context:
            pose_img = pose_images[i]
            frame = np.array(pose_img)
            x, x_max, y, y_max = bbox_clip_list[k]
            if x >= x_max or y >= y_max:
                # Empty/degenerate box: fall back to (almost) the whole frame.
                x, x_max, y, y_max = 0, frame.shape[1] - 1, 0, frame.shape[0] - 1
            frame = frame[y:y_max, x:x_max]
            frame = Image.fromarray(frame)
            frames_res.append(frame)

            vid = vid_images[i]
            vid = np.array(vid)
            vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))

            bk = bk_images[i]
            bk = np.array(bk)
            bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))

    return frames_res, vid_res, bk_res, bbox_clip, context_list, bbox_clip_list
|
|
|
|
|
|
|
|
def crop_human_clip(pose_images, vid_images, bk_images, clip_length=1):
    """Crop frames in fixed-length clips; each clip shares one union bbox.

    For every `clip_length` consecutive frames (the last clip may be shorter)
    the per-frame boxes are unioned, made even-sided, and applied to the
    pose, video and background frames of that clip.

    Args:
        pose_images, vid_images, bk_images: equal-length frame sequences
            (PIL-image-like; converted via np.array).
        clip_length: number of frames sharing one bbox.

    Returns:
        (frames_res, vid_res, bk_res, bbox_clip): cropped pose/video/background
        frames and the per-clip bbox list [x, x_max, y, y_max].
    """
    bbox_clip = []
    # Running union bbox for the current clip (sentinel start).
    x, x_max, y, y_max = init_bbox()
    n_frame = len(pose_images)
    for i in range(0, n_frame):
        pose_img = pose_images[i]
        frame = np.array(pose_img)
        mask = extract_mask_sdc(frame)
        mask = clean_mask(mask)
        # Per-frame box: padded bounding rect, even-sided, squared to /16.
        y_, y_max_, x_, x_max_ = crop_img_sdc(frame, mask)
        x_, x_max_, y_, y_max_ = bbox_div2(x_, x_max_, y_, y_max_)
        x_, x_max_, y_, y_max_ = bbox_pad(x_, x_max_, y_, y_max_, frame)

        # Fold this frame's box into the clip union.
        y = min(y, y_)
        y_max = max(y_max, y_max_)
        x = min(x, x_)
        x_max = max(x_max, x_max_)

        # Clip boundary reached (or final, possibly shorter, clip).
        if ((i+1) % clip_length == 0) or (i==n_frame-1):
            x, x_max, y, y_max = bbox_div2(x, x_max, y, y_max)
            if x>=x_max or y>=y_max:
                # Degenerate/empty union: fall back to (almost) the whole frame.
                x, x_max, y, y_max = 0, frame.shape[1]-1, 0, frame.shape[0]-1

            bbox_clip.append([x, x_max, y, y_max])
            x, x_max, y, y_max = init_bbox()

    frames_res = []
    vid_res = []
    bk_res = []
    for i, pose_img in enumerate(pose_images):
        # Integer division maps each frame to its clip's bbox.
        x, x_max, y, y_max = bbox_clip[i//clip_length]
        frame = np.array(pose_img)
        frame = frame[y:y_max, x:x_max]
        frame = Image.fromarray(frame)
        frames_res.append(frame)

        vid = vid_images[i]
        vid = np.array(vid)
        vid_res.append(Image.fromarray(vid[y:y_max, x:x_max]))

        bk = bk_images[i]
        bk = np.array(bk)
        bk_res.append(Image.fromarray(bk[y:y_max, x:x_max]))
    return frames_res, vid_res, bk_res, bbox_clip
|
|
|
|
|
|
|
|
def init_bk(n_frame, h, w):
    """Create `n_frame` all-white HxW RGB PIL images (one fresh array each,
    so the images never share pixel storage)."""
    return [
        Image.fromarray(np.full((h, w, 3), 255, dtype=np.uint8))
        for _ in range(n_frame)
    ]
|
|
|
|
|
|
|
|
|
|
|
def pose_adjust(pose_image, width=512, height=784):
    """Fit a pose image to (width, height): resize to the target height
    preserving aspect ratio, then center-pad with black (if narrower) or
    center-crop (if wider) to the target width.

    Returns:
        PIL image of exactly width x height.
    """
    pose_arr = np.array(pose_image)
    h, w, c = pose_arr.shape

    # Scale so the height matches exactly; width follows the aspect ratio.
    new_w = int(w * height / h)
    resized = cv2.resize(pose_arr, (new_w, height), interpolation=cv2.INTER_AREA)

    if new_w < width:
        # Too narrow: paste centered onto a black canvas.
        canvas = np.zeros((height, width, 3), dtype=np.uint8)
        offset = (width - new_w) // 2
        canvas[:, offset:offset + new_w, :] = resized
    else:
        # Too wide: take the centered width-wide slice.
        offset = (new_w - width) // 2
        canvas = resized[:, offset:offset + width, :]

    return Image.fromarray(canvas)
|
|
|
|
|
|
|
|
def load_pretrain_pose_guider(model, ckpt_path):
    """Load a pretrained pose-guider checkpoint, widening conv_in to 3x its
    input channels.

    The checkpoint's 'conv_in.weight' (shape [out, in, kh, kw]) is
    concatenated with two zero blocks along the input-channel axis (dim=1),
    so a model whose conv_in expects three times the original channels can
    load the state dict with strict=True; the extra channels initially
    contribute nothing.

    Fix: `torch` was used here without being imported anywhere in the file;
    imported locally so this function is self-contained.

    Args:
        model: module whose state dict matches the checkpoint except conv_in.
        ckpt_path: path to a torch checkpoint containing a plain state dict.

    Returns:
        The same model, with the adapted weights loaded.
    """
    import torch  # torch is not imported at module level in this file

    state_dict = torch.load(ckpt_path, map_location="cpu")

    weights = state_dict['conv_in.weight']
    # Zero-pad the input-channel dimension to 3x its original size.
    weights = torch.cat((weights, torch.zeros_like(weights), torch.zeros_like(weights)), dim=1)
    state_dict['conv_in.weight'] = weights

    model.load_state_dict(state_dict, strict=True)

    return model
|
|
|
|
|
def refine_img_prepross(image, mask):
    """Stack an image and a single-channel mask into one HxWx(C+1) array.

    Fix: the result variable was named `input`, shadowing the builtin;
    renamed. Behavior is unchanged.

    Args:
        image: HxWxC array-like; converted with np.asarray and cast to float32.
        mask: HxW array appended as the last channel (np.concatenate promotes
            to a common dtype; pass float32 to keep the result float32).

    Returns:
        HxWx(C+1) float array: image channels followed by the mask channel.
    """
    im_ary = np.asarray(image).astype(np.float32)
    stacked = np.concatenate([im_ary, mask[:, :, np.newaxis]], axis=-1)
    return stacked
|
|
|
|
|
# Mapping from "which image borders the crop region touches" to an index into
# the mask list produced by load_mask_list and selected by get_mask.
mask_mode = {'up_down_left_right': 0, 'left_right_up': 1, 'left_right_down': 2, 'up_down_left': 3, 'up_down_right': 4,
             'left_right': 5, 'up_down': 6, 'left_up': 7, 'right_up': 8, 'left_down': 9, 'right_down': 10,
             'left': 11, 'right': 12, 'up': 13, 'down': 14, 'inner': 15}
|
|
|
|
|
def get_mask(mask_list, bbox, img):
    """Pick the blending mask matching which image borders the bbox touches.

    bbox is (w_min, w_max, h_min, h_max); a border counts as touched when the
    box reaches or crosses it (<= 0 on the min side, >= size on the max side).
    The mode string indexes mask_list via the module-level mask_mode table.
    """
    w, h = img.size
    w_min, w_max, h_min, h_max = bbox

    touch_left = w_min <= 0
    touch_right = w_max >= w
    touch_up = h_min <= 0
    touch_down = h_max >= h

    # Exhaustive dispatch over the 16 (left, right, up, down) combinations;
    # equivalent to the original if/elif precedence chain.
    mode_by_sides = {
        (True, True, True, True): 'up_down_left_right',
        (True, True, True, False): 'left_right_up',
        (True, True, False, True): 'left_right_down',
        (True, False, True, True): 'up_down_left',
        (False, True, True, True): 'up_down_right',
        (True, True, False, False): 'left_right',
        (False, False, True, True): 'up_down',
        (True, False, True, False): 'left_up',
        (False, True, True, False): 'right_up',
        (True, False, False, True): 'left_down',
        (False, True, False, True): 'right_down',
        (True, False, False, False): 'left',
        (False, True, False, False): 'right',
        (False, False, True, False): 'up',
        (False, False, False, True): 'down',
        (False, False, False, False): 'inner',
    }
    mode = mode_by_sides[(touch_left, touch_right, touch_up, touch_down)]

    return mask_list[mask_mode[mode]]
|
|
|
|
|
def load_mask_list(mask_path):
    """Load one grayscale float32 mask (values in [0, 1]) per mode, in the
    order defined by the mask_mode table. Files are expected next to
    mask_path with the mode name suffixed before '.png'."""
    masks = []
    for key in mask_mode:
        raw = cv2.imread(mask_path[:-4] + '_%s.png' % key)
        gray = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)
        masks.append(gray.astype(np.float32) / 255.0)
    return masks
|
|
|
|
|
def recover_bk(images, start_idx, end_idx, template_name=None):
    """Blank frames [start_idx, end_idx) to white, in place in the list.

    For the "dance_indoor_1" template the frames themselves are mutated
    (array-style channel assignment, rows limited to images[0]'s height);
    otherwise each frame is replaced with a fresh all-white PIL image shaped
    like images[0].
    """
    reference = np.array(images[0])
    n_rows = reference.shape[0]
    for idx in range(start_idx, end_idx):
        if template_name == "dance_indoor_1":
            # Whiten the first three channels row-limited to the reference height.
            for channel in (0, 1, 2):
                images[idx][:n_rows, :, channel] = 255
        else:
            images[idx] = Image.fromarray(np.ones_like(reference) * 255)
    return images
|
|
|
|
|
|
|
|
def load_video_fixed_fps(vid_path, target_fps=30, target_speed=1):
    """Decode a video, resampling frames so that playback at target_fps runs
    at target_speed relative to the source.

    Frame indices are taken at a stride of target_speed * src_fps / target_fps
    (so a source faster than the target is subsampled, a slower one has
    frames repeated); indices past the reader's length are skipped.

    Returns:
        List of PIL images.
    """
    reader = imageio.get_reader(vid_path)
    src_fps = round(reader.get_meta_data()['fps'])

    stride = target_speed * src_fps / target_fps
    total = reader.count_frames()
    indices = np.arange(0, total, stride).astype(int)

    frames = []
    for idx in indices:
        if idx < len(reader):
            frames.append(Image.fromarray(reader.get_data(idx)))

    reader.close()
    return frames
|
|
|
|
|
|