Shawon16 committed
Commit b25c835 · verified · 1 Parent(s): 5e595d8

Update app.py

Files changed (1)
  1. app.py +34 -77
app.py CHANGED
@@ -9,16 +9,10 @@ from pytorchvideo.transforms import (
 )
 from torchvision.transforms import Compose, Lambda, Resize
 from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
-from matplotlib import pyplot as plt
 
-
-from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
-
-
-# make sure to define a model here not a dataset
-MODEL_CKPT = "Shawon16/VideoMAE_BdSLW401_20_epochs_p5_SR_10" # BdSLW401
+# Load model and processor
+MODEL_CKPT = "Shawon16/VideoMAE_BdSLW401_20_epochs_p5_SR_10"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
 MODEL = VideoMAEForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
 PROCESSOR = VideoMAEFeatureExtractor.from_pretrained(MODEL_CKPT)
 
@@ -35,103 +29,66 @@ VAL_TRANSFORMS = Compose(
 )
 LABELS = list(MODEL.config.label2id.keys())
 
-
 def parse_video(video_file):
-    """A utility to parse the input videos.
-    Reference: https://pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/
-    """
+    """Extract frames from a video file with a sample rate of 10."""
     vs = cv2.VideoCapture(video_file)
-
-    # try to determine the total number of frames in the video file
-    try:
-        prop = (
-            cv2.cv.CV_CAP_PROP_FRAME_COUNT
-            if imutils.is_cv2()
-            else cv2.CAP_PROP_FRAME_COUNT
-        )
-        total = int(vs.get(prop))
-        print("[INFO] {} total frames in video".format(total))
-
-    # an error occurred while trying to determine the total
-    # number of frames in the video file
-    except:
-        print("[INFO] could not determine # of frames in video")
-        print("[INFO] no approx. completion time can be provided")
-        total = -1
-
     frames = []
-
-    # loop over frames from the video file stream
+    frame_id = 0
+
     while True:
-        # read the next frame from the file
-        (grabbed, frame) = vs.read()
-        if frame is not None:
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frames.append(frame)
-        # if the frame was not grabbed, then we have reached the end
-        # of the stream
+        grabbed, frame = vs.read()
         if not grabbed:
             break
+
+        if frame_id % 10 == 0:  # Sample every 10th frame
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(frame)
 
+        frame_id += 1
+
+    vs.release()
     return frames
 
-
-def preprocess_video(frames: list):
-    """Utility to apply preprocessing transformations to a video tensor."""
-    # Each frame in the `frames` list has the shape: (height, width, num_channels).
-    # Collated together the `frames` has the the shape: (num_frames, height, width, num_channels).
-    # So, after converting the `frames` list to a torch tensor, we permute the shape
-    # such that it becomes (num_channels, num_frames, height, width) to make
-    # the shape compatible with the preprocessing transformations. After applying the
-    # preprocessing chain, we permute the shape to (num_frames, num_channels, height, width)
-    # to make it compatible with the model. Finally, we add a batch dimension so that our video
-    # classification model can operate on it.
+def preprocess_video(frames):
+    """Preprocess video frames for inference."""
     video_tensor = torch.tensor(np.array(frames).astype(frames[0].dtype))
-    video_tensor = video_tensor.permute(
-        3, 0, 1, 2
-    )  # (num_channels, num_frames, height, width)
+    video_tensor = video_tensor.permute(3, 0, 1, 2)  # (num_channels, num_frames, height, width)
     video_tensor_pp = VAL_TRANSFORMS(video_tensor)
-    video_tensor_pp = video_tensor_pp.permute(
-        1, 0, 2, 3
-    )  # (num_frames, num_channels, height, width)
-    video_tensor_pp = video_tensor_pp.unsqueeze(0)
+    video_tensor_pp = video_tensor_pp.permute(1, 0, 2, 3)  # (num_frames, num_channels, height, width)
+    video_tensor_pp = video_tensor_pp.unsqueeze(0)  # Add batch dimension
    return video_tensor_pp.to(DEVICE)
 
-
 def infer(video_file):
     frames = parse_video(video_file)
     video_tensor = preprocess_video(frames)
     inputs = {"pixel_values": video_tensor}
 
-    # forward pass
+    # Forward pass
     with torch.no_grad():
         outputs = MODEL(**inputs)
     logits = outputs.logits
     softmax_scores = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
     confidences = {LABELS[i]: float(softmax_scores[i]) for i in range(len(LABELS))}
-    return confidences
 
+    return confidences, frames  # Remove confidence plot
 
+# Gradio Interface with Video Upload Only
 gr.Interface(
     fn=infer,
-    #inputs=gr.Video(),
-    inputs=gr.File(file_types=["video"]),
-    outputs=gr.Label(num_top_classes=5),
+    inputs=[gr.Video(label="Upload Video")],
+    outputs=[
+        gr.Label(num_top_classes=5, label="Top 5 Predictions"),
+        gr.Gallery(label="Sampled Frames (Rate: 10)", columns=5, height="300px"),
+    ],
     examples=[
-        #['/media/cse/HDD/Shawon/shawon/MY DATA/15% split train-val frame rate corrected/test/aam/U4W37F_trial_0_R.mp4'],
-        #["/kaggle/input/mini-dataset-10-class/mini dataset 10 class/aam/U12W37F_trial_2.mp4"],
-
+        [r"C:\Users\shawo\Desktop\BdSLW60 Full DataSet\FrameRate Corrected Clips\W1\U4W1F_trial_4_L.mp4"],
+        [r"C:\Users\shawo\Desktop\BdSLW60 Full DataSet\FrameRate Corrected Clips\W2\U8W2F_trial_6_R.mp4"],
+        [r"C:\Users\shawo\Desktop\BdSLW60 Full DataSet\FrameRate Corrected Clips\W20\U4W20F_trial_9_R.mp4"],
     ],
-    title="LLM Finetuned on BdSLW60 -- A Word Level Bangla Sign Language Dataset",
+    title="Bangla Word Level (BdSLW401) Sign Language Recognition Interface",
     description=(
-        "Gradio demo for BdSLW60 Word Level Dataset Classification. To use it, simply upload your video or click one of the"
-        " examples to load them. Read more at the links below."
+        "This system uses a fine-tuned videoLLM (videoMAE) to classify Bangla Sign Language words from video inputs."
+        " Upload a video for predictions."
     ),
-    article=(
-        "<div style='text-align: center;'><a href='https://huggingface.co/docs/transformers/model_doc/videomae' target='_blank'>VideoMAE</a>"
-        " <center><a href='https://huggingface.co/Shawon16/VideoMAE_BdSLW60_FrameRateCorrected_withoutAug_20epoch_batch8' target='_blank'>Fine-tuned Model</a></center></div>"
-    ),
-    flagging_mode="never"
-    ,
-
-).launch() # share=true for public link
+    flagging_mode="never",
+).launch()
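
Note: the updated infer() now returns a (confidences, frames) pair so it can drive both outputs, gr.Label and gr.Gallery. The snippet below is a minimal sanity-check sketch, not part of this commit: it assumes it is pasted into app.py just above the gr.Interface(...) call (so MODEL, LABELS, and the helper functions are already defined), and "example_clip.mp4" is a placeholder path for any short local sign-language clip.

# Hypothetical sanity check (not part of this commit); paste above gr.Interface(...).
# "example_clip.mp4" is a placeholder for any short local video clip.
test_confidences, test_frames = infer("example_clip.mp4")
print("Sampled frames kept (every 10th):", len(test_frames))
# gr.Label accepts a dict of label -> probability; show the five highest here.
top5 = sorted(test_confidences.items(), key=lambda kv: kv[1], reverse=True)[:5]
print("Top-5 predictions:", top5)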