Shawon16 committed
Commit b25c835 · verified · 1 Parent(s): 5e595d8

Update app.py

Files changed (1)
  1. app.py +34 -77
app.py CHANGED
@@ -9,16 +9,10 @@ from pytorchvideo.transforms import (
 )
 from torchvision.transforms import Compose, Lambda, Resize
 from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
-from matplotlib import pyplot as plt
 
-
-from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
-
-
-# make sure to define a model here not a dataset
-MODEL_CKPT = "Shawon16/VideoMAE_BdSLW401_20_epochs_p5_SR_10" # BdSLW401
+# Load model and processor
+MODEL_CKPT = "Shawon16/VideoMAE_BdSLW401_20_epochs_p5_SR_10"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
 MODEL = VideoMAEForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
 PROCESSOR = VideoMAEFeatureExtractor.from_pretrained(MODEL_CKPT)
 
@@ -35,103 +29,66 @@ VAL_TRANSFORMS = Compose(
 )
 LABELS = list(MODEL.config.label2id.keys())
 
-
 def parse_video(video_file):
-    """A utility to parse the input videos.
-    Reference: https://pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/
-    """
+    """Extract frames from a video file with a sample rate of 10."""
     vs = cv2.VideoCapture(video_file)
-
-    # try to determine the total number of frames in the video file
-    try:
-        prop = (
-            cv2.cv.CV_CAP_PROP_FRAME_COUNT
-            if imutils.is_cv2()
-            else cv2.CAP_PROP_FRAME_COUNT
-        )
-        total = int(vs.get(prop))
-        print("[INFO] {} total frames in video".format(total))
-
-    # an error occurred while trying to determine the total
-    # number of frames in the video file
-    except:
-        print("[INFO] could not determine # of frames in video")
-        print("[INFO] no approx. completion time can be provided")
-        total = -1
-
     frames = []
-
-    # loop over frames from the video file stream
+    frame_id = 0
+
     while True:
-        # read the next frame from the file
-        (grabbed, frame) = vs.read()
-        if frame is not None:
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frames.append(frame)
-        # if the frame was not grabbed, then we have reached the end
-        # of the stream
+        grabbed, frame = vs.read()
         if not grabbed:
             break
+
+        if frame_id % 10 == 0:  # Sample every 10th frame
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(frame)
 
+        frame_id += 1
+
+    vs.release()
     return frames
 
-
-def preprocess_video(frames: list):
-    """Utility to apply preprocessing transformations to a video tensor."""
-    # Each frame in the `frames` list has the shape: (height, width, num_channels).
-    # Collated together the `frames` has the the shape: (num_frames, height, width, num_channels).
-    # So, after converting the `frames` list to a torch tensor, we permute the shape
-    # such that it becomes (num_channels, num_frames, height, width) to make
-    # the shape compatible with the preprocessing transformations. After applying the
-    # preprocessing chain, we permute the shape to (num_frames, num_channels, height, width)
-    # to make it compatible with the model. Finally, we add a batch dimension so that our video
-    # classification model can operate on it.
+def preprocess_video(frames):
+    """Preprocess video frames for inference."""
     video_tensor = torch.tensor(np.array(frames).astype(frames[0].dtype))
-    video_tensor = video_tensor.permute(
-        3, 0, 1, 2
-    )  # (num_channels, num_frames, height, width)
+    video_tensor = video_tensor.permute(3, 0, 1, 2)  # (num_channels, num_frames, height, width)
     video_tensor_pp = VAL_TRANSFORMS(video_tensor)
-    video_tensor_pp = video_tensor_pp.permute(
-        1, 0, 2, 3
-    )  # (num_frames, num_channels, height, width)
-    video_tensor_pp = video_tensor_pp.unsqueeze(0)
+    video_tensor_pp = video_tensor_pp.permute(1, 0, 2, 3)  # (num_frames, num_channels, height, width)
+    video_tensor_pp = video_tensor_pp.unsqueeze(0)  # Add batch dimension
    return video_tensor_pp.to(DEVICE)
 
-
 def infer(video_file):
     frames = parse_video(video_file)
     video_tensor = preprocess_video(frames)
     inputs = {"pixel_values": video_tensor}
 
-    # forward pass
+    # Forward pass
     with torch.no_grad():
         outputs = MODEL(**inputs)
     logits = outputs.logits
     softmax_scores = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
     confidences = {LABELS[i]: float(softmax_scores[i]) for i in range(len(LABELS))}
-    return confidences
 
+    return confidences, frames  # Remove confidence plot
 
+# Gradio Interface with Video Upload Only
 gr.Interface(
     fn=infer,
-    #inputs=gr.Video(),
-    inputs=gr.File(file_types=["video"]),
-    outputs=gr.Label(num_top_classes=5),
+    inputs=[gr.Video(label="Upload Video")],
+    outputs=[
+        gr.Label(num_top_classes=5, label="Top 5 Predictions"),
+        gr.Gallery(label="Sampled Frames (Rate: 10)", columns=5, height="300px"),
+    ],
     examples=[
-        #['/media/cse/HDD/Shawon/shawon/MY DATA/15% split train-val frame rate corrected/test/aam/U4W37F_trial_0_R.mp4'],
-        #["/kaggle/input/mini-dataset-10-class/mini dataset 10 class/aam/U12W37F_trial_2.mp4"],
-
+        [r"C:\Users\shawo\Desktop\BdSLW60 Full DataSet\FrameRate Corrected Clips\W1\U4W1F_trial_4_L.mp4"],
+        [r"C:\Users\shawo\Desktop\BdSLW60 Full DataSet\FrameRate Corrected Clips\W2\U8W2F_trial_6_R.mp4"],
+        [r"C:\Users\shawo\Desktop\BdSLW60 Full DataSet\FrameRate Corrected Clips\W20\U4W20F_trial_9_R.mp4"],
     ],
-    title="LLM Finetuned on BdSLW60 -- A Word Level Bangla Sign Language Dataset",
+    title="Bangla Word Level (BdSLW401) Sign Language Recognition Interface",
     description=(
-        "Gradio demo for BdSLW60 Word Level Dataset Classification. To use it, simply upload your video or click one of the"
-        " examples to load them. Read more at the links below."
+        "This system uses a fine-tuned videoLLM (videoMAE) to classify Bangla Sign Language words from video inputs."
+        " Upload a video for predictions."
     ),
-    article=(
-        "<div style='text-align: center;'><a href='https://huggingface.co/docs/transformers/model_doc/videomae' target='_blank'>VideoMAE</a>"
-        " <center><a href='https://huggingface.co/Shawon16/VideoMAE_BdSLW60_FrameRateCorrected_withoutAug_20epoch_batch8' target='_blank'>Fine-tuned Model</a></center></div>"
-    ),
-    flagging_mode="never"
-    ,
-
-).launch() # share=true for public link
+    flagging_mode="never",
+).launch()
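
Note: the updated infer() now returns a (confidences, frames) pair so it can drive both outputs, gr.Label and gr.Gallery. The snippet below is a minimal sanity-check sketch, not part of this commit: it assumes it is pasted into app.py just above the gr.Interface(...) call (so MODEL, LABELS, and the helper functions are already defined), and "example_clip.mp4" is a placeholder path for any short local sign-language clip.

# Hypothetical sanity check (not part of this commit); paste above gr.Interface(...).
# "example_clip.mp4" is a placeholder for any short local video clip.
test_confidences, test_frames = infer("example_clip.mp4")
print("Sampled frames kept (every 10th):", len(test_frames))
# gr.Label accepts a dict of label -> probability; show the five highest here.
top5 = sorted(test_confidences.items(), key=lambda kv: kv[1], reverse=True)[:5]
print("Top-5 predictions:", top5)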