Hi, I'm trying to build a custom loader to stream in video data from S3 for a diarization task (based on the audio.manual recipe). I've based my approach on this answer.
I've adjusted it slightly by loading the videos from a signed URL (signed_url), which should behave exactly like a public URL. However, while the videos load correctly, the audio waveform is blank (see below). Since this is a diarization task, the waveform is critical.
Fig. 1: Waveform is missing.
This is my current custom stream generator:
import boto3
from config import Config
import re
from botocore import client
from datetime import datetime
class S3Service(object):
    """Stream annotation tasks for objects stored in an S3 bucket.

    The bucket name, AWS credentials and signed-URL lifetime are all read
    from ``Config``.
    """

    def __init__(
            self,
    ):
        self.bucket = Config.bucket_name
        self.s3 = self.get_s3()

    @staticmethod
    def get_s3():
        """Return a boto3 S3 client configured for Signature Version 4."""
        s3 = boto3.client(
            's3',
            aws_access_key_id=Config.aws_access_key_id,
            aws_secret_access_key=Config.aws_secret_access_key,
            config=client.Config(signature_version='s3v4')
        )
        return s3

    @staticmethod
    def get_s3_direct_file_regex(
            prefix
    ):
        """Compile a regex matching keys that sit directly under ``prefix``.

        Keys in nested "sub-folders" (containing a further ``/`` after the
        prefix) and the bare folder placeholder key itself do not match.

        Args:
            prefix: Key prefix, with or without a trailing ``/``. ``None``
                or ``''`` means the bucket root, i.e. keys with no ``/``.

        Returns:
            A compiled ``re.Pattern``.
        """
        if not prefix:
            # Without this guard ``None`` raised AttributeError and ``''``
            # produced '^/[^/]+$', which no ordinary key can match.
            return re.compile(r'^[^/]+$')
        if not prefix.endswith('/'):
            prefix += '/'
        escaped_subdir = re.escape(prefix)
        pattern = rf'^{escaped_subdir}[^/]+$'
        return re.compile(pattern)

    def generate_signed_url(
            self,
            object_key,
    ) -> str:
        """Return a presigned ``get_object`` URL for ``object_key``.

        The URL is valid for ``Config.expires_in`` seconds.

        NOTE(review): a front end that downloads this URL from script
        (e.g. to draw a waveform) presumably needs a CORS rule on the
        bucket allowing GET from the app's origin - confirm.
        """
        signed_url = self.s3.generate_presigned_url(
            ClientMethod='get_object',
            Params={
                'Bucket': self.bucket,
                'Key': object_key,
            },
            ExpiresIn=Config.expires_in
        )
        return signed_url

    def stream_from_s3(
            self,
            file_type,
            prefix=None,
    ):
        """Yield one task dict per object directly under ``prefix``.

        Args:
            file_type: Key under which the signed URL is stored in each
                yielded dict.
            prefix: Optional key prefix to list. When omitted, only
                objects at the bucket root (no ``/`` in the key) are
                yielded.

        Yields:
            ``{file_type: <signed url>, 'meta': {...}}`` where ``meta``
            holds the ``s3://`` location of the object and the local time
            at which the task was generated.
        """
        paginator = self.s3.get_paginator('list_objects')
        paginate_params = {
            'Bucket': self.bucket
        }
        if prefix:
            paginate_params['Prefix'] = prefix
        page_iterator = paginator.paginate(**paginate_params)
        pattern = self.get_s3_direct_file_regex(prefix)
        for page in page_iterator:
            # S3 omits 'Contents' from a page that lists no objects.
            for obj in page.get('Contents', []):
                object_key = obj['Key']
                if not pattern.match(object_key):
                    continue
                signed_url = self.generate_signed_url(
                    object_key,
                )
                annotation_element = {
                    file_type: signed_url,
                    'meta': {
                        "s3_object_key": f"s3://{self.bucket}/{object_key}",
                        "time_stamp": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    }
                }
                yield annotation_element
This is called directly in a custom recipe as follows:
stream = s3_service.stream_from_s3(file_type=file_type, prefix=prefix)
Any advice on getting the waveform to display from a URL? I've also experimented with reading the video and converting it to base64 with both native Python and Prodigy converters, but those approaches just hung indefinitely. Any help appreciated!
