Hi, I'm trying to build a custom loader to stream in video data from S3 for a diarization task (based on the audio.manual
recipe). I've based my approach on this answer.
I've adjusted it slightly by loading the videos from a signed_url,
which should behave exactly like a public URL. However, although the videos load, the audio waveform is blank (see below). Since this is a diarization task, the waveform is critical.
Fig. 1: Waveform is missing.
This is my current custom stream generator:
import boto3
from config import Config
import re
from botocore import client
from datetime import datetime
class S3Service(object):
    """Stream annotation tasks that point at objects in an S3 bucket.

    The bucket name and credentials are read from ``Config``. Each yielded
    task carries a presigned GET URL for one object plus metadata recording
    the object's ``s3://`` location and the time the task was generated.
    """

    def __init__(
        self,
    ):
        self.bucket = Config.bucket_name
        self.s3 = self.get_s3()

    @staticmethod
    def get_s3():
        """Return a boto3 S3 client configured for SigV4 request signing."""
        s3 = boto3.client(
            's3',
            aws_access_key_id=Config.aws_access_key_id,
            aws_secret_access_key=Config.aws_secret_access_key,
            config=client.Config(signature_version='s3v4')
        )
        return s3

    @staticmethod
    def get_s3_direct_file_regex(
        prefix
    ):
        """Compile a regex matching keys that sit directly under ``prefix``.

        A key matches when it consists of the prefix (with a trailing ``/``
        added if missing) followed by one or more non-``/`` characters, so
        keys in nested "sub-directories" and the bare prefix itself are
        rejected.

        Args:
            prefix: Key prefix to anchor on. ``None`` or an empty string
                selects keys at the bucket root (keys containing no ``/``).

        Returns:
            A compiled ``re.Pattern``.
        """
        if not prefix:
            # No prefix: match root-level keys instead of failing on
            # ``None.endswith`` or building a pattern anchored on a bare '/'.
            escaped_subdir = ''
        else:
            if not prefix.endswith('/'):
                prefix += '/'
            escaped_subdir = re.escape(prefix)
        pattern = rf'^{escaped_subdir}[^/]+$'
        return re.compile(pattern)

    def generate_signed_url(
        self,
        object_key,
    ) -> str:
        """Return a presigned ``get_object`` URL for ``object_key``.

        The URL lifetime is taken from ``Config.expires_in`` and passed to
        boto3 as ``ExpiresIn``.
        """
        signed_url = self.s3.generate_presigned_url(
            ClientMethod='get_object',
            Params={
                'Bucket': self.bucket,
                'Key': object_key,
            },
            ExpiresIn=Config.expires_in
        )
        return signed_url

    def stream_from_s3(
        self,
        file_type,
        prefix=None,
    ):
        """Yield one task dict per object found directly under ``prefix``.

        Args:
            file_type: Key under which the presigned URL is stored in each
                task (e.g. the media field name the consuming recipe reads).
            prefix: Optional key prefix. When given it is sent to S3 as the
                listing ``Prefix``; when ``None`` the whole bucket is listed
                and only root-level keys are yielded.

        Yields:
            Dicts of the form ``{file_type: <signed url>, 'meta': {...}}``
            where ``meta`` holds ``s3_object_key`` and ``time_stamp``.
        """
        paginator = self.s3.get_paginator('list_objects')
        paginate_params = {
            'Bucket': self.bucket
        }
        if prefix is not None:
            paginate_params['Prefix'] = prefix
        page_iterator = paginator.paginate(**paginate_params)
        pattern = self.get_s3_direct_file_regex(prefix)
        for page in page_iterator:
            # S3 omits 'Contents' when a page has no objects; treat that as
            # an empty page rather than raising KeyError.
            for obj in page.get('Contents', []):
                object_key = obj.get("Key")
                if not object_key or not pattern.match(object_key):
                    continue
                signed_url = self.generate_signed_url(
                    object_key,
                )
                annotation_element = {
                    file_type: signed_url,
                    'meta': {
                        "s3_object_key": f"s3://{self.bucket}/{object_key}",
                        "time_stamp": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    }
                }
                yield annotation_element
This is called directly in a custom recipe as follows:
# Build the recipe's task stream from the S3 generator.
# NOTE(review): presumably `s3_service` is an S3Service instance and
# `file_type` / `prefix` are recipe arguments defined earlier — confirm.
stream = s3_service.stream_from_s3(file_type=file_type, prefix=prefix)
Any advice on how to get the waveform to display from a URL? I've also experimented with reading the video and converting it to base64
with both Python
native and Prodigy
converters, but those approaches just hang indefinitely. Any help appreciated!