Spaces: Build error

Commit 6155c0e
Frank Pacini committed
Parent(s): e694ec3

copy repo
Browse files
- CustomFile.py +19 -0
- README.md +4 -3
- app.py +35 -0
- audio_feature_extraction_final.py +125 -0
- ava_action_list.pbtxt +240 -0
- coco.names +80 -0
- environment.yml +5 -0
- requirements.txt +28 -0
- slowfast.py +191 -0
- video_object_extraction.py +185 -0
- visualization.py +706 -0
- yolov3.cfg +789 -0
CustomFile.py
ADDED
```python
import gradio as gr
# from typing import Dict
# import base64

# def encode_file_to_base64(f):
#     with open(f, "rb") as file:
#         encoded_string = base64.b64encode(file.read())
#     base64_str = str(encoded_string, "utf-8")
#     return base64_str

class CustomFile(gr.File):
    # def postprocess(self, y: str) -> Dict:
    #     res = super().postprocess(y)
    #     if res is not None:
    #         res['data'] = encode_file_to_base64(res['name'])
    #     return res
    def dummy(self):
        return
```
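Note: with `postprocess` commented out, `CustomFile` behaves exactly like `gr.File` apart from the no-op `dummy` method. A minimal sketch of dropping it in where a `gr.File` output would go (the `export` handler and `results.csv` path are hypothetical, not part of this commit):

```python
import gradio as gr
from CustomFile import CustomFile

def export(text):
    # hypothetical handler that writes a file and returns its path
    path = "results.csv"
    with open(path, "w") as f:
        f.write(text)
    return path

# CustomFile subclasses gr.File, so it is accepted anywhere gr.File is.
demo = gr.Interface(export, inputs=gr.Textbox(), outputs=CustomFile())
```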
README.md
CHANGED
```diff
 ---
-title:
-emoji:
+title: Fall2022 Videoanalysis
+emoji: 📈
 colorFrom: yellow
-colorTo:
+colorTo: purple
 sdk: gradio
 sdk_version: 3.12.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
app.py
ADDED
```python
import gradio as gr

from slowfast import slow_fast_train
from video_object_extraction import video_object_extraction
from audio_feature_extraction_final import audio_feature_extraction
from CustomFile import CustomFile

import numpy as np
import pandas as pd
import pickle
import torch

try:
    import detectron2
except:
    import os
    os.system('pip install git+https://github.com/facebookresearch/detectron2.git')


def predict(video_path, frames):
    # gpu = torch.cuda.is_available()
    # video_1, df1 = slow_fast_train(video_path, gpu)
    # video_2, df2 = video_object_extraction(video_path, frames)
    # audio_path = audio_feature_extraction(video_path, gpu)
    # return ([video_1, video_2, audio_path], df1, df2)
    audio_features = np.random.rand(2, 2)
    audio_path = 'audio_embeddings'
    with open(audio_path, 'wb') as f:
        pickle.dump(audio_features, f)
    df = pd.DataFrame()
    return ([video_path, video_path, audio_path], df, df)


iface = gr.Interface(predict, inputs=[gr.Video(), gr.Slider(1, 100, value=15)], outputs=[gr.File(), gr.Dataframe(max_rows=10), gr.Dataframe(max_rows=10)])
iface.launch(show_error=True, debug=True)
```
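Note: as committed, `predict` is a stub. The real pipeline calls are commented out, and it returns the input video twice plus a pickle of random 2×2 "embeddings", which lets the Space build and exercise the UI without running the models. Re-enabling the pipeline would look like this (taken directly from the commented-out lines, assuming the three module functions keep the signatures shown in this commit):

```python
def predict(video_path, frames):
    gpu = torch.cuda.is_available()
    video_1, df1 = slow_fast_train(video_path, gpu)              # SlowFast action recognition
    video_2, df2 = video_object_extraction(video_path, frames)   # YOLOv3 object detection
    audio_path = audio_feature_extraction(video_path, gpu)       # pickled audio features
    return ([video_1, video_2, audio_path], df1, df2)
```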
audio_feature_extraction_final.py
ADDED
```python
import torch
from torchaudio import load as torchaudio_load
from moviepy.editor import VideoFileClip

from pyannote.audio import Pipeline
from sklearn.preprocessing import LabelEncoder
from librosa import load as librosa_load
import librosa.display
import math
import pandas as pd

import sys
from tqdm import tqdm
import numpy as np
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration, pipeline as transformers_pipeline
import pickle


"""Author: Frank"""
def extract_s2t_features(gpu):
    model_name = "medium"
    processor = Speech2TextProcessor.from_pretrained("facebook/s2t-{}-librispeech-asr".format(model_name))
    model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-{}-librispeech-asr".format(model_name))
    if gpu:
        model = model.cuda()
    model.load_state_dict(torch.load('s2t_model'))
    model.eval()

    sample_rate = 16000
    embedding_window = 10  # in secs

    audio, _ = torchaudio_load('temp.wav')
    audio = torch.mean(audio, dim=0)

    embs = []
    audio_clips = audio.split(embedding_window * sample_rate)
    if len(audio_clips) > 1:
        audio_clips = audio_clips[:-1]
    for clip in tqdm(audio_clips):
        with torch.no_grad():
            inputs = processor(clip, sampling_rate=16000, return_tensors="pt")
            features = inputs["input_features"]
            decoder_input = torch.zeros(features.shape[:2], dtype=torch.int32)
            if gpu:
                features, decoder_input = features.cuda(), decoder_input.cuda()

            h = model.model(features, decoder_input_ids=decoder_input).last_hidden_state.cpu()
            emb = torch.mean(h, axis=1)
            embs.append(emb)
    return torch.cat(embs).numpy()


"""Author: Sichao"""
def extract_speaker_features(gpu):
    x, sample_rate = librosa_load('temp.wav')
    print('Input sample rate: {}, Length: {} s'.format(sample_rate, x.size / sample_rate))

    # speaker diarization
    print('Start speaker diarization...')
    pipeline = Pipeline.from_pretrained("pyannote/[email protected]", use_auth_token='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')
    diarization = pipeline('temp.wav')
    speaker_per_sec_dict = {i: 'UNKNOWN' for i in range(0, math.ceil(x.size / sample_rate))}

    for turn, _, speaker in diarization.itertracks(yield_label=True):
        for clip_start in range(math.ceil(turn.start), math.ceil(turn.end)):
            if speaker_per_sec_dict[clip_start] == 'UNKNOWN':
                speaker_per_sec_dict[clip_start] = speaker
            elif speaker_per_sec_dict[clip_start] != speaker:
                speaker_per_sec_dict[clip_start] = speaker_per_sec_dict[clip_start] + ' ' + speaker

    speaker_per_clip = []
    for i in range(0, math.ceil(x.size / sample_rate), 10):
        speakers = []
        for j in range(10):
            if i + j in speaker_per_sec_dict and speaker_per_sec_dict[i + j] != 'UNKNOWN':
                speakers.append(speaker_per_sec_dict[i + j])
        if len(speakers) > 0:
            is_single_speaker = all(s == speakers[0] for s in speakers)
            if is_single_speaker:
                speaker_per_clip.append(speakers[0])
            else:
                speaker_per_clip.append('MULTI SPEAKER')
        else:
            speaker_per_clip.append('UNKNOWN')

    # Adult/child classification
    print('Start adult child classification...')
    device = 0 if gpu else -1
    audio_classifier = transformers_pipeline(task="audio-classification", model="bookbot/wav2vec2-adult-child-cls", device=device)
    clip_idxs = [i for i in range(0, math.ceil(x.size / sample_rate), 10)]
    classifications = []
    for clip_start in tqdm(clip_idxs):
        with torch.no_grad():
            preds = audio_classifier(x[clip_start * sample_rate:(clip_start + 10) * sample_rate])
            preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
            classifications.append(preds[0]['label'])

    # output
    print('Output...')
    output = {'clip_start': clip_idxs, 'diarization': speaker_per_clip, 'adult_child_classification': classifications}
    output_df = pd.DataFrame(output)
    # Create an instance of the label encoder.
    le = LabelEncoder()

    # Encode and return the encoded labels.
    output_df['diarization_numeric'] = le.fit_transform(output_df['diarization'])
    output_df['adult_child_classification_numeric'] = le.fit_transform(output_df['adult_child_classification'])
    return output_df['diarization_numeric'].values, output_df['adult_child_classification_numeric'].values

def audio_feature_extraction(input_path, gpu=False):
    output_path = 'audio_embedding'
    audioTrack = VideoFileClip(input_path).audio
    audioTrack.write_audiofile('temp.wav', codec='pcm_s16le', fps=16000)

    print('Extracting s2t features...')
    s2t_features = extract_s2t_features(gpu)
    print('Extracting speaker features...')
    diarization_features, adult_child_class_features = extract_speaker_features(gpu)

    if len(diarization_features) > 1:
        diarization_features, adult_child_class_features = diarization_features[:-1], adult_child_class_features[:-1]
    audio_features = np.concatenate((s2t_features, diarization_features[:, None], adult_child_class_features[:, None]), axis=1)
    with open(output_path, 'wb') as f:
        pickle.dump(audio_features, f)
    return output_path
```
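Note: `audio_feature_extraction` pickles a single NumPy array of shape `(num_clips, hidden_dim + 2)`: the Speech2Text clip embeddings with the diarization code and adult/child code appended as the last two columns. A minimal sketch of reading it back (column layout inferred from the `np.concatenate` call above):

```python
import pickle

with open('audio_embedding', 'rb') as f:
    feats = pickle.load(f)

s2t_embeddings = feats[:, :-2]   # Speech2Text embeddings, one row per 10 s clip
diarization_code = feats[:, -2]  # label-encoded speaker per clip
adult_child_code = feats[:, -1]  # label-encoded adult/child class per clip
print(feats.shape)
```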
ava_action_list.pbtxt
ADDED
```
item {
  name: "bend/bow (at the waist)"
  id: 1
}
item {
  name: "crouch/kneel"
  id: 3
}
item {
  name: "dance"
  id: 4
}
item {
  name: "fall down"
  id: 5
}
item {
  name: "get up"
  id: 6
}
item {
  name: "jump/leap"
  id: 7
}
item {
  name: "lie/sleep"
  id: 8
}
item {
  name: "martial art"
  id: 9
}
item {
  name: "run/jog"
  id: 10
}
item {
  name: "sit"
  id: 11
}
item {
  name: "stand"
  id: 12
}
item {
  name: "swim"
  id: 13
}
item {
  name: "walk"
  id: 14
}
item {
  name: "answer phone"
  id: 15
}
item {
  name: "carry/hold (an object)"
  id: 17
}
item {
  name: "climb (e.g., a mountain)"
  id: 20
}
item {
  name: "close (e.g., a door, a box)"
  id: 22
}
item {
  name: "cut"
  id: 24
}
item {
  name: "dress/put on clothing"
  id: 26
}
item {
  name: "drink"
  id: 27
}
item {
  name: "drive (e.g., a car, a truck)"
  id: 28
}
item {
  name: "eat"
  id: 29
}
item {
  name: "enter"
  id: 30
}
item {
  name: "hit (an object)"
  id: 34
}
item {
  name: "lift/pick up"
  id: 36
}
item {
  name: "listen (e.g., to music)"
  id: 37
}
item {
  name: "open (e.g., a window, a car door)"
  id: 38
}
item {
  name: "play musical instrument"
  id: 41
}
item {
  name: "point to (an object)"
  id: 43
}
item {
  name: "pull (an object)"
  id: 45
}
item {
  name: "push (an object)"
  id: 46
}
item {
  name: "put down"
  id: 47
}
item {
  name: "read"
  id: 48
}
item {
  name: "ride (e.g., a bike, a car, a horse)"
  id: 49
}
item {
  name: "sail boat"
  id: 51
}
item {
  name: "shoot"
  id: 52
}
item {
  name: "smoke"
  id: 54
}
item {
  name: "take a photo"
  id: 56
}
item {
  name: "text on/look at a cellphone"
  id: 57
}
item {
  name: "throw"
  id: 58
}
item {
  name: "touch (an object)"
  id: 59
}
item {
  name: "turn (e.g., a screwdriver)"
  id: 60
}
item {
  name: "watch (e.g., TV)"
  id: 61
}
item {
  name: "work on a computer"
  id: 62
}
item {
  name: "write"
  id: 63
}
item {
  name: "fight/hit (a person)"
  id: 64
}
item {
  name: "give/serve (an object) to (a person)"
  id: 65
}
item {
  name: "grab (a person)"
  id: 66
}
item {
  name: "hand clap"
  id: 67
}
item {
  name: "hand shake"
  id: 68
}
item {
  name: "hand wave"
  id: 69
}
item {
  name: "hug (a person)"
  id: 70
}
item {
  name: "kiss (a person)"
  id: 72
}
item {
  name: "lift (a person)"
  id: 73
}
item {
  name: "listen to (a person)"
  id: 74
}
item {
  name: "push (another person)"
  id: 76
}
item {
  name: "sing to (e.g., self, a person, a group)"
  id: 77
}
item {
  name: "take (an object) from (a person)"
  id: 78
}
item {
  name: "talk to (e.g., self, a person, a group)"
  id: 79
}
item {
  name: "watch (a person)"
  id: 80
}
```
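Note: this label map is consumed in `slowfast.py` via `AvaLabeledVideoFramePaths.read_label_map`. A minimal sketch of inspecting it (the ids are sparse, so the dict lookup matters):

```python
from pytorchvideo.data.ava import AvaLabeledVideoFramePaths

# label_map maps class id -> name; allowed_ids is the set of ids in the file
label_map, allowed_ids = AvaLabeledVideoFramePaths.read_label_map('ava_action_list.pbtxt')
print(label_map[1])      # "bend/bow (at the waist)"
print(2 in allowed_ids)  # False: ids 2, 16, 18, ... are absent from the file
```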
coco.names
ADDED
```
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
```
environment.yml
ADDED
```yaml
name: env
dependencies:
  - cudatoolkit
  - pip:
    - -r requirements.txt
```
requirements.txt
ADDED
```
imutils
matplotlib
numpy
pandas
opencv-python
ffmpeg-python
pytorchvideo

cython
scipy
tqdm
gdown
cmake

# Torch
--find-links https://download.pytorch.org/whl/cu111
torch==1.10.0
torchvision==0.11.1

# Detectron
--find-links https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html
detectron2

moviepy
pyannote.audio
scikit-learn
librosa
transformers
```
slowfast.py
ADDED
```python
import numpy as np
import pandas as pd
import cv2
import torch
import warnings
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
import ffmpeg
import pytorchvideo
from pytorchvideo.transforms.functional import (
    uniform_temporal_subsample,
    short_side_scale_with_boxes,
    clip_boxes_to_image
)
from torchvision.transforms._functional_video import normalize
from pytorchvideo.data.ava import AvaLabeledVideoFramePaths
from pytorchvideo.models.hub import slowfast_r50_detection  # Another option is slow_r50_detection
from visualization import VideoVisualizer


# This method takes in an image and generates the bounding boxes for people in the image.
def get_person_bboxes(inp_img, predictor):
    predictions = predictor(inp_img.cpu().detach().numpy())['instances'].to('cpu')
    boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
    scores = predictions.scores if predictions.has("scores") else None
    classes = np.array(predictions.pred_classes.tolist() if predictions.has("pred_classes") else None)
    predicted_boxes = boxes[np.logical_and(classes == 0, scores > 0.75)].tensor.cpu()  # only person
    return predicted_boxes


def ava_inference_transform(
    clip,
    boxes,
    num_frames=32,        # use 4 if using slow_r50_detection; 32 for slowfast_r50_detection
    crop_size=256,
    data_mean=[0.45, 0.45, 0.45],
    data_std=[0.225, 0.225, 0.225],
    slow_fast_alpha=4,    # use None if using slow_r50_detection; 4 for slowfast_r50_detection
    device='cpu'):

    boxes = np.array(boxes)
    ori_boxes = boxes.copy()

    # Image [0, 255] -> [0, 1].
    clip = uniform_temporal_subsample(clip, num_frames)
    clip = clip.float()
    clip = clip / 255.0

    height, width = clip.shape[2], clip.shape[3]
    # The format of boxes is [x1, y1, x2, y2]. The input boxes are in the
    # range of [0, width] for x and [0, height] for y.
    boxes = clip_boxes_to_image(boxes, height, width)

    # Resize short side to crop_size. Non-local and STRG use 256.
    clip, boxes = short_side_scale_with_boxes(clip, size=crop_size, boxes=boxes)

    # Normalize images by mean and std.
    clip = normalize(clip, np.array(data_mean, dtype=np.float32), np.array(data_std, dtype=np.float32))

    boxes = clip_boxes_to_image(boxes, clip.shape[2], clip.shape[3])

    # In case of SlowFast, generate both pathways.
    if slow_fast_alpha is not None:
        fast_pathway = clip
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(clip, 1, torch.linspace(
            0, clip.shape[1] - 1, clip.shape[1] // slow_fast_alpha).long())
        clip = [slow_pathway.unsqueeze(0).to(device), fast_pathway.unsqueeze(0).to(device)]

    return clip, torch.from_numpy(boxes), ori_boxes


# get video info
def with_opencv(filename):
    video = cv2.VideoCapture(filename)
    frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
    fps = video.get(cv2.CAP_PROP_FPS)
    s = round(frame_count / fps)
    video.release()
    return int(s), fps


def slow_fast_train(file_path, gpu=False):
    device = 'cuda' if gpu else 'cpu'
    top_k = 1

    video_model = slowfast_r50_detection(True)  # Another option is slow_r50_detection(True)
    video_model = video_model.eval().to(device)
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.55  # set threshold for this model
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
    cfg.MODEL.DEVICE = device
    predictor = DefaultPredictor(cfg)
    # Create an id-to-label-name mapping.
    label_map, allowed_class_ids = AvaLabeledVideoFramePaths.read_label_map('ava_action_list.pbtxt')
    # Create a video visualizer that can plot bounding boxes and visualize actions on bboxes.
    video_visualizer = VideoVisualizer(81, label_map, top_k=top_k, mode="thres", thres=0.5)  # show the top-k predictions in each bounding box

    # preprocess video data
    encoded_vid = pytorchvideo.data.encoded_video.EncodedVideo.from_path(file_path)

    # Video predictions are generated each second for the whole video.
    total_sec, fps = with_opencv(file_path)
    time_stamp_range = range(0, total_sec)  # time stamps in video for which clips are sampled
    clip_duration = 1.0  # duration of clip used for each inference step
    gif_imgs = []
    xleft, ytop, xright, ybottom = [], [], [], []
    labels = []
    time_frame = []
    scores = []

    for time_stamp in time_stamp_range:

        # Generate a clip around the designated time stamp.
        inp_imgs = encoded_vid.get_clip(
            time_stamp - clip_duration / 2.0,
            time_stamp + clip_duration / 2.0)
        inp_imgs = inp_imgs['video']

        # if time_stamp % 15 == 0:
        # Generate people bbox predictions using Detectron2's off-the-shelf pre-trained predictor.
        # We use the middle image in each clip to generate the bounding boxes.
        inp_img = inp_imgs[:, inp_imgs.shape[1] // 2, :, :]
        inp_img = inp_img.permute(1, 2, 0)

        # Predicted boxes are of the form List[(x_1, y_1, x_2, y_2)].
        predicted_boxes = get_person_bboxes(inp_img, predictor)
        if len(predicted_boxes) == 0:
            print("Skipping clip, no frames detected at time stamp: ", time_stamp)
            continue

        # Preprocess clip and bounding boxes for video action recognition.
        inputs, inp_boxes, _ = ava_inference_transform(inp_imgs, predicted_boxes.numpy(), device=device)
        # Prepend data sample id for each bounding box.
        # For more details refer to the RoIAlign in Detectron2.
        inp_boxes = torch.cat([torch.zeros(inp_boxes.shape[0], 1), inp_boxes], dim=1)

        # Generate action predictions for the bounding boxes in the clip.
        # The model here takes in the pre-processed video clip and the detected bounding boxes.
        preds = video_model(inputs, inp_boxes.to(device))  # change inputs to inputs.unsqueeze(0).to(device) if using slow_r50

        preds = preds.to('cpu')
        # The model is trained on AVA and AVA labels are 1-indexed, so prepend 0 to convert to 0-indexed.
        preds = torch.cat([torch.zeros(preds.shape[0], 1), preds], dim=1)

        # Plot predictions on the video and save for later visualization.
        inp_imgs = inp_imgs.permute(1, 2, 3, 0)
        inp_imgs = inp_imgs / 255.0
        out_img_pred = video_visualizer.draw_clip_range(inp_imgs, preds, predicted_boxes)
        gif_imgs += out_img_pred

        # format of bboxes: (x_left, y_top, x_right, y_bottom)
        predicted_boxes_lst = predicted_boxes.tolist()
        topscores, topclasses = torch.topk(preds, k=1)
        topscores, topclasses = topscores.tolist(), topclasses.tolist()
        topclasses = np.concatenate(topclasses)
        topscores = np.concatenate(topscores)

        # add the top-1 behavior prediction for each time step
        for i in range(len(predicted_boxes_lst)):
            xleft.append(predicted_boxes_lst[i][0])
            ytop.append(predicted_boxes_lst[i][1])
            xright.append(predicted_boxes_lst[i][2])
            ybottom.append(predicted_boxes_lst[i][3])
            labels.append(label_map.get(topclasses[i]))
            time_frame.append(time_stamp)
            scores.append(topscores[i])

    print("Finished generating predictions.")
    # Generate the metadata file.
    metadata = pd.DataFrame()
    metadata['frame'] = time_frame
    metadata['x_left'] = xleft
    metadata['y_top'] = ytop
    metadata['x_right'] = xright
    metadata['y_bottom'] = ybottom
    metadata['label'] = labels
    metadata['confidence'] = scores

    height, width = gif_imgs[0].shape[0], gif_imgs[0].shape[1]
    video_save_path = 'activity_recognition.mp4'
    video = cv2.VideoWriter(video_save_path, cv2.VideoWriter_fourcc(*'mp4v'), int(fps), (width, height))

    for image in gif_imgs:
        img = (255 * image).astype(np.uint8)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        video.write(img)
    video.release()

    return video_save_path, metadata
```
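Note: a minimal sketch of driving this module directly, per the signature above (`clip.mp4` is a placeholder path, not part of this commit):

```python
from slowfast import slow_fast_train

# Runs per-second SlowFast action recognition on CPU, writes
# 'activity_recognition.mp4', and returns its path plus a per-box
# metadata DataFrame (frame, box coordinates, label, confidence).
video_path, metadata = slow_fast_train('clip.mp4', gpu=False)
print(metadata.head())
```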
video_object_extraction.py
ADDED
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 8 16:18:28 2022

@author: ariellee
"""

# import argparse
from pathlib import Path
import cv2
import numpy as np
from imutils.video import FPS
import pandas as pd
import os


# def str2bool(v):
#     """
#     Converts string to bool type, enables command line
#     arguments in the format of '--arg1 true --arg2 false'
#     """
#     if isinstance(v, bool):
#         return v
#     if v.lower() in ('yes', 'true', 't', 'y', '1'):
#         return True
#     elif v.lower() in ('no', 'false', 'f', 'n', '0'):
#         return False
#     else:
#         raise argparse.ArgumentTypeError('Boolean value expected (true/false)')


# def get_args_parser():
#     parser = argparse.ArgumentParser('Wheelock evaluation script for classroom object detection',
#                                      add_help=False)
#     parser.add_argument('--output_dir', default='', type=str,
#                         help='path to save the feature extraction results')
#     parser.add_argument('--output_name', default='video_out', type=str, help='name of csv \
#                         file with object features and annotated video with object tracking \
#                         and bounding boxes')
#     parser.add_argument('--video_path', default='short',
#                         type=str, help='path to input video, do not include file extension')
#     parser.add_argument('--is_mp4', type=str2bool, default=False,
#                         help='must be an mp4 file')
#     parser.add_argument('--save_csv', type=str2bool, default=True,
#                         help='if true, a csv file of extracted features will be saved in output_dir')
#     parser.add_argument('--labels', default='coco.names', type=str,
#                         help='labels for classes model can detect')
#     parser.add_argument('--weights', default='yolov3.weights', type=str,
#                         help='weights for pretrained yolo model')
#     parser.add_argument('--cfg', default='yolov3.cfg', type=str,
#                         help='model configuration parameters')
#     return parser


def video_object_extraction(video_path, frames):
    '''
    Object detection and feature extraction with yolov3
    Uses darknet repo by pjreddie

    Returns: (1) csv file with extracted object features
                 columns: frame_number, x_start, y_start, x_end, y_end, label, confidence
             (2) mp4 video with object bounding boxes and tracking
    '''
    # video_path = args.video_path + '.mp4'
    print('Reading from video {}...'.format(video_path))
    cap = cv2.VideoCapture(video_path)

    # get total number of frames in the video
    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)

    # get height and width of video
    H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))

    fps = FPS().start()
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')

    # (cols, rows) format
    # root = os.path.join(args.output_dir, args.output_name)
    wp = 'object_detection.mp4'
    g_fps = int(cap.get(cv2.CAP_PROP_FPS))
    writer = cv2.VideoWriter(wp, fourcc, g_fps, (W, H))
    # labels = open(args.labels).read().strip().split('\n')
    labels = open('coco.names').read().strip().split('\n')
    bbox_colors = np.random.randint(0, 255, size=(len(labels), 3), dtype='uint8')

    yolo = cv2.dnn.readNetFromDarknet('yolov3.cfg', 'yolov3.weights')
    out_layers = yolo.getLayerNames()
    layers = [out_layers[i - 1] for i in yolo.getUnconnectedOutLayers()]
    count = 0
    stat_list = []

    while count < total_frames:

        _, frame = cap.read()

        if count == 0 or count % frames == 0:
            blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True)
            yolo.setInput(blob)

            layer_outputs = yolo.forward(layers)
            boxes = []
            confidences = []
            classes = []

            # loop over layer outputs and objects detected
            for output in layer_outputs:
                for obj in output:

                    # extract class and detection likelihood of current object
                    scores = obj[5:]
                    obj_class = np.argmax(scores)
                    confidence = scores[obj_class]

                    # get rid of bad predictions
                    if confidence > 0.4:

                        # scale bbox coordinates relative to frame size
                        box = obj[0:4] * np.array([W, H, W, H])
                        centerX, centerY, width, height = box.astype('int')

                        # final coordinates
                        x = int(centerX - (width / 2))
                        y = int(centerY - (height / 2))

                        # update list of bbox coordinates, confidences, classes
                        boxes.append([x, y, int(width), int(height)])
                        confidences.append(float(confidence))
                        classes.append(obj_class)

            # non-max suppression for overlapping bounding boxes
            idxs = cv2.dnn.NMSBoxes(boxes, confidences, 0.4, 0.4)

            for i in idxs.flatten():

                # extract coordinates
                (x, y) = (boxes[i][0], boxes[i][1])
                (w, h) = (boxes[i][2], boxes[i][3])

                # set up + add bboxes to frame
                color = [int(c) for c in bbox_colors[classes[i]]]
                cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
                text = "{}: {:.4f}".format(labels[classes[i]], confidences[i])
                (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                cv2.rectangle(frame, (x, y - text_height), (x + text_width, y), color, cv2.FILLED)
                cv2.putText(frame, text, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38, 38, 38), 2)

                # format of each csv row is: frame number / x / y / w / h / label / confidence
                stat_list.append([count, x, y, w, h, labels[classes[i]], confidences[i]])

        writer.write(frame)
        fps.update()
        count += 1

    df = pd.DataFrame(stat_list, columns=['frame', 'x_left', 'y_top', 'x_right',
                                          'y_bottom', 'label', 'confidence'])
    fps.stop()
    print('Time elapsed (seconds): {:.2f}'.format(fps.elapsed()))
    writer.release()
    cap.release()

    return wp, df


# if __name__ == '__main__':

#     parser = argparse.ArgumentParser('Wheelock evaluation script for classroom object detection', parents=[get_args_parser()])
#     args = parser.parse_args()

#     if not args.is_mp4:
#         print('Video must be an mp4 file.')
#     else:
#         if args.output_dir:
#             Path(args.output_dir).mkdir(parents=True, exist_ok=True)
#         main(args)
```
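Note: a minimal sketch of calling this function, per the signature above (assumes `yolov3.cfg`, `yolov3.weights`, and `coco.names` sit in the working directory; `classroom.mp4` is a placeholder path):

```python
from video_object_extraction import video_object_extraction

# Runs YOLOv3 detection on every 15th frame, writes 'object_detection.mp4',
# and returns its path plus a DataFrame with one row per detection.
out_path, detections = video_object_extraction('classroom.mp4', frames=15)
print(detections.head())
```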
visualization.py
ADDED
|
@@ -0,0 +1,706 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
| 2 |
+
# Note: This file has been barrowed from facebookresearch/slowfast repo. And it is used to add the bounding boxes and predictions to the frame.
|
| 3 |
+
# TODO: Migrate this into the core PyTorchVideo libarary.
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import itertools
|
| 7 |
+
# import logging
|
| 8 |
+
from types import SimpleNamespace
|
| 9 |
+
from typing import Dict, List, Optional, Tuple, Union
|
| 10 |
+
|
| 11 |
+
import matplotlib.pyplot as plt
|
| 12 |
+
import numpy as np
|
| 13 |
+
import torch
|
| 14 |
+
from detectron2.utils.visualizer import Visualizer
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _create_text_labels(
|
| 21 |
+
classes: List[int],
|
| 22 |
+
scores: List[float],
|
| 23 |
+
class_names: List[str],
|
| 24 |
+
ground_truth: bool = False,
|
| 25 |
+
) -> List[str]:
|
| 26 |
+
"""
|
| 27 |
+
Create text labels.
|
| 28 |
+
Args:
|
| 29 |
+
classes (list[int]): a list of class ids for each example.
|
| 30 |
+
scores (list[float] or None): list of scores for each example.
|
| 31 |
+
class_names (list[str]): a list of class names, ordered by their ids.
|
| 32 |
+
ground_truth (bool): whether the labels are ground truth.
|
| 33 |
+
Returns:
|
| 34 |
+
labels (list[str]): formatted text labels.
|
| 35 |
+
"""
|
| 36 |
+
try:
|
| 37 |
+
labels = [class_names.get(c, "n/a") for c in classes]
|
| 38 |
+
except IndexError:
|
| 39 |
+
# logger.error("Class indices get out of range: {}".format(classes))
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
+
if ground_truth:
|
| 43 |
+
labels = ["[{}] {}".format("GT", label) for label in labels]
|
| 44 |
+
elif scores is not None:
|
| 45 |
+
assert len(classes) == len(scores)
|
| 46 |
+
labels = ["[{:.2f}] {}".format(s, label) for s, label in zip(scores, labels)]
|
| 47 |
+
return labels
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class ImgVisualizer(Visualizer):
|
| 51 |
+
def __init__(
|
| 52 |
+
self, img_rgb: torch.Tensor, meta: Optional[SimpleNamespace] = None, **kwargs
|
| 53 |
+
) -> None:
|
| 54 |
+
"""
|
| 55 |
+
See https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/visualizer.py
|
| 56 |
+
for more details.
|
| 57 |
+
Args:
|
| 58 |
+
img_rgb: a tensor or numpy array of shape (H, W, C), where H and W correspond to
|
| 59 |
+
the height and width of the image respectively. C is the number of
|
| 60 |
+
color channels. The image is required to be in RGB format since that
|
| 61 |
+
is a requirement of the Matplotlib library. The image is also expected
|
| 62 |
+
to be in the range [0, 255].
|
| 63 |
+
meta (MetadataCatalog): image metadata.
|
| 64 |
+
See https://github.com/facebookresearch/detectron2/blob/81d5a87763bfc71a492b5be89b74179bd7492f6b/detectron2/data/catalog.py#L90
|
| 65 |
+
"""
|
| 66 |
+
super(ImgVisualizer, self).__init__(img_rgb, meta, **kwargs)
|
| 67 |
+
|
| 68 |
+
def draw_text(
|
| 69 |
+
self,
|
| 70 |
+
text: str,
|
| 71 |
+
position: List[int],
|
| 72 |
+
*,
|
| 73 |
+
font_size: Optional[int] = None,
|
| 74 |
+
color: str = "w",
|
| 75 |
+
horizontal_alignment: str = "center",
|
| 76 |
+
vertical_alignment: str = "bottom",
|
| 77 |
+
box_facecolor: str = "black",
|
| 78 |
+
alpha: float = 0.5,
|
| 79 |
+
) -> None:
|
| 80 |
+
"""
|
| 81 |
+
Draw text at the specified position.
|
| 82 |
+
Args:
|
| 83 |
+
text (str): the text to draw on image.
|
| 84 |
+
position (list of 2 ints): the x,y coordinate to place the text.
|
| 85 |
+
font_size (Optional[int]): font of the text. If not provided, a font size
|
| 86 |
+
proportional to the image width is calculated and used.
|
| 87 |
+
color (str): color of the text. Refer to `matplotlib.colors` for full list
|
| 88 |
+
of formats that are accepted.
|
| 89 |
+
horizontal_alignment (str): see `matplotlib.text.Text`.
|
| 90 |
+
vertical_alignment (str): see `matplotlib.text.Text`.
|
| 91 |
+
box_facecolor (str): color of the box wrapped around the text. Refer to
|
| 92 |
+
`matplotlib.colors` for full list of formats that are accepted.
|
| 93 |
+
alpha (float): transparency level of the box.
|
| 94 |
+
"""
|
| 95 |
+
if not font_size:
|
| 96 |
+
font_size = self._default_font_size
|
| 97 |
+
x, y = position
|
| 98 |
+
self.output.ax.text(
|
| 99 |
+
x,
|
| 100 |
+
y,
|
| 101 |
+
text,
|
| 102 |
+
size=font_size * self.output.scale,
|
| 103 |
+
family="monospace",
|
| 104 |
+
bbox={
|
| 105 |
+
"facecolor": box_facecolor,
|
| 106 |
+
"alpha": alpha,
|
| 107 |
+
"pad": 0.7,
|
| 108 |
+
"edgecolor": "none",
|
| 109 |
+
},
|
| 110 |
+
verticalalignment=vertical_alignment,
|
| 111 |
+
horizontalalignment=horizontal_alignment,
|
| 112 |
+
color=color,
|
| 113 |
+
zorder=10,
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
def draw_multiple_text(
|
| 117 |
+
self,
|
| 118 |
+
text_ls: List[str],
|
| 119 |
+
box_coordinate: torch.Tensor,
|
| 120 |
+
*,
|
| 121 |
+
top_corner: bool = True,
|
| 122 |
+
font_size: Optional[int] = None,
|
| 123 |
+
color: str = "w",
|
| 124 |
+
box_facecolors: str = "black",
|
| 125 |
+
alpha: float = 0.5,
|
| 126 |
+
) -> None:
|
| 127 |
+
"""
|
| 128 |
+
Draw a list of text labels for some bounding box on the image.
|
| 129 |
+
Args:
|
| 130 |
+
text_ls (list of strings): a list of text labels.
|
| 131 |
+
box_coordinate (tensor): shape (4,). The (x_left, y_top, x_right, y_bottom)
|
| 132 |
+
coordinates of the box.
|
| 133 |
+
top_corner (bool): If True, draw the text labels at (x_left, y_top) of the box.
|
| 134 |
+
Else, draw labels at (x_left, y_bottom).
|
| 135 |
+
font_size (Optional[int]): font of the text. If not provided, a font size
|
| 136 |
+
proportional to the image width is calculated and used.
|
| 137 |
+
color (str): color of the text. Refer to `matplotlib.colors` for full list
|
| 138 |
+
of formats that are accepted.
|
| 139 |
+
box_facecolors (str): colors of the box wrapped around the text. Refer to
|
| 140 |
+
`matplotlib.colors` for full list of formats that are accepted.
|
| 141 |
+
alpha (float): transparency level of the box.
|
| 142 |
+
"""
|
| 143 |
+
if not isinstance(box_facecolors, list):
|
| 144 |
+
box_facecolors = [box_facecolors] * len(text_ls)
|
| 145 |
+
assert len(box_facecolors) == len(
|
| 146 |
+
text_ls
|
| 147 |
+
), "Number of colors provided is not equal to the number of text labels."
|
| 148 |
+
if not font_size:
|
| 149 |
+
font_size = self._default_font_size
|
| 150 |
+
text_box_width = font_size + font_size // 2
|
| 151 |
+
# If the texts does not fit in the assigned location,
|
| 152 |
+
# we split the text and draw it in another place.
|
| 153 |
+
if top_corner:
|
| 154 |
+
num_text_split = self._align_y_top(
|
| 155 |
+
box_coordinate, len(text_ls), text_box_width
|
| 156 |
+
)
|
| 157 |
+
y_corner = 1
|
| 158 |
+
else:
|
| 159 |
+
num_text_split = len(text_ls) - self._align_y_bottom(
|
| 160 |
+
box_coordinate, len(text_ls), text_box_width
|
| 161 |
+
)
|
| 162 |
+
y_corner = 3
|
| 163 |
+
|
| 164 |
+
text_color_sorted = sorted(
|
| 165 |
+
zip(text_ls, box_facecolors), key=lambda x: x[0], reverse=True
|
| 166 |
+
)
|
| 167 |
+
if len(text_color_sorted) != 0:
|
| 168 |
+
text_ls, box_facecolors = zip(*text_color_sorted)
|
| 169 |
+
else:
|
| 170 |
+
text_ls, box_facecolors = [], []
|
| 171 |
+
text_ls, box_facecolors = list(text_ls), list(box_facecolors)
|
| 172 |
+
self.draw_multiple_text_upward(
|
| 173 |
+
text_ls[:num_text_split][::-1],
|
| 174 |
+
box_coordinate,
|
| 175 |
+
y_corner=y_corner,
|
| 176 |
+
font_size=font_size,
|
| 177 |
+
color=color,
|
| 178 |
+
box_facecolors=box_facecolors[:num_text_split][::-1],
|
| 179 |
+
alpha=alpha,
|
| 180 |
+
)
|
| 181 |
+
self.draw_multiple_text_downward(
|
| 182 |
+
text_ls[num_text_split:],
|
| 183 |
+
box_coordinate,
|
| 184 |
+
y_corner=y_corner,
|
| 185 |
+
font_size=font_size,
|
| 186 |
+
color=color,
|
| 187 |
+
box_facecolors=box_facecolors[num_text_split:],
|
| 188 |
+
alpha=alpha,
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
def draw_multiple_text_upward(
|
| 192 |
+
self,
|
| 193 |
+
text_ls: List[str],
|
| 194 |
+
box_coordinate: torch.Tensor,
|
| 195 |
+
*,
|
| 196 |
+
y_corner: int = 1,
|
| 197 |
+
font_size: Optional[int] = None,
|
| 198 |
+
color: str = "w",
|
| 199 |
+
box_facecolors: str = "black",
|
| 200 |
+
alpha: float = 0.5,
|
| 201 |
+
) -> None:
|
| 202 |
+
"""
|
| 203 |
+
Draw a list of text labels for some bounding box on the image in upward direction.
|
| 204 |
+
The next text label will be on top of the previous one.
|
| 205 |
+
Args:
|
| 206 |
+
text_ls (list of strings): a list of text labels.
|
| 207 |
+
box_coordinate (tensor): shape (4,). The (x_left, y_top, x_right, y_bottom)
|
| 208 |
+
coordinates of the box.
|
| 209 |
+
y_corner (int): Value of either 1 or 3. Indicate the index of the y-coordinate of
|
| 210 |
+
the box to draw labels around.
|
| 211 |
+
font_size (Optional[int]): font of the text. If not provided, a font size
|
| 212 |
+
proportional to the image width is calculated and used.
|
| 213 |
+
color (str): color of the text. Refer to `matplotlib.colors` for full list
|
| 214 |
+
of formats that are accepted.
|
| 215 |
+
box_facecolors (str or list of strs): colors of the box wrapped around the
|
| 216 |
+
text. Refer to `matplotlib.colors` for full list of formats that
|
| 217 |
+
are accepted.
|
| 218 |
+
alpha (float): transparency level of the box.
|
| 219 |
+
"""
|
| 220 |
+
if not isinstance(box_facecolors, list):
|
| 221 |
+
box_facecolors = [box_facecolors] * len(text_ls)
|
| 222 |
+
assert len(box_facecolors) == len(
|
| 223 |
+
text_ls
|
| 224 |
+
), "Number of colors provided is not equal to the number of text labels."
|
| 225 |
+
|
| 226 |
+
assert y_corner in [1, 3], "Y_corner must be either 1 or 3"
|
| 227 |
+
if not font_size:
|
| 228 |
+
font_size = self._default_font_size
|
| 229 |
+
|
| 230 |
+
x, horizontal_alignment = self._align_x_coordinate(box_coordinate)
|
| 231 |
+
y = box_coordinate[y_corner].item()
|
| 232 |
+
for i, text in enumerate(text_ls):
|
| 233 |
+
self.draw_text(
|
| 234 |
+
text,
|
| 235 |
+
(x, y),
|
| 236 |
+
font_size=font_size,
|
| 237 |
+
color=color,
|
| 238 |
+
horizontal_alignment=horizontal_alignment,
|
| 239 |
+
vertical_alignment="bottom",
|
| 240 |
+
box_facecolor=box_facecolors[i],
|
| 241 |
+
alpha=alpha,
|
| 242 |
+
)
|
| 243 |
+
y -= font_size + font_size // 2
|
| 244 |
+
|
| 245 |
+
def draw_multiple_text_downward(
|
| 246 |
+
self,
|
| 247 |
+
text_ls: List[str],
|
| 248 |
+
box_coordinate: torch.Tensor,
|
| 249 |
+
*,
|
| 250 |
+
y_corner: int = 1,
|
| 251 |
+
font_size: Optional[int] = None,
|
| 252 |
+
color: str = "w",
|
| 253 |
+
box_facecolors: str = "black",
|
| 254 |
+
alpha: float = 0.5,
|
| 255 |
+
) -> None:
|
| 256 |
+
"""
|
| 257 |
+
Draw a list of text labels for some bounding box on the image in downward direction.
|
| 258 |
+
The next text label will be below the previous one.
|
| 259 |
+
Args:
|
| 260 |
+
text_ls (list of strings): a list of text labels.
|
| 261 |
+
box_coordinate (tensor): shape (4,). The (x_left, y_top, x_right, y_bottom)
|
| 262 |
+
coordinates of the box.
|
| 263 |
+
y_corner (int): Value of either 1 or 3. Indicate the index of the y-coordinate of
|
| 264 |
+
the box to draw labels around.
|
| 265 |
+
font_size (Optional[int]): font of the text. If not provided, a font size
|
| 266 |
+
proportional to the image width is calculated and used.
|
| 267 |
+
color (str): color of the text. Refer to `matplotlib.colors` for full list
|
| 268 |
+
of formats that are accepted.
|
| 269 |
+
box_facecolors (str): colors of the box wrapped around the text. Refer to
|
| 270 |
+
`matplotlib.colors` for full list of formats that are accepted.
|
| 271 |
+
alpha (float): transparency level of the box.
|
| 272 |
+
"""
|
| 273 |
+
if not isinstance(box_facecolors, list):
|
| 274 |
+
box_facecolors = [box_facecolors] * len(text_ls)
|
| 275 |
+
assert len(box_facecolors) == len(
|
| 276 |
+
text_ls
|
| 277 |
+
), "Number of colors provided is not equal to the number of text labels."
|
| 278 |
+
|
| 279 |
+
assert y_corner in [1, 3], "Y_corner must be either 1 or 3"
|
| 280 |
+
if not font_size:
|
| 281 |
+
font_size = self._default_font_size
|
| 282 |
+
|
| 283 |
+
x, horizontal_alignment = self._align_x_coordinate(box_coordinate)
|
| 284 |
+
y = box_coordinate[y_corner].item()
|
| 285 |
+
for i, text in enumerate(text_ls):
|
| 286 |
+
self.draw_text(
|
| 287 |
+
text,
|
| 288 |
+
(x, y),
|
| 289 |
+
font_size=font_size,
|
| 290 |
+
color=color,
|
| 291 |
+
horizontal_alignment=horizontal_alignment,
|
| 292 |
+
vertical_alignment="top",
|
| 293 |
+
box_facecolor=box_facecolors[i],
|
| 294 |
+
alpha=alpha,
|
| 295 |
+
)
|
| 296 |
+
y += font_size + font_size // 2
|
| 297 |
+
|
| 298 |
+
    def _align_x_coordinate(self, box_coordinate: torch.Tensor) -> Tuple[float, str]:
        """
        Choose an x-coordinate from the box to make sure the text label
        does not go out of frames. By default, the left x-coordinate is
        chosen and text is aligned left. If the box is too close to the
        right side of the image, then the right x-coordinate is chosen
        instead and the text is aligned right.
        Args:
            box_coordinate (array-like): shape (4,). The (x_left, y_top, x_right, y_bottom)
                coordinates of the box.
        Returns:
            x_coordinate (float): the chosen x-coordinate.
            alignment (str): whether to align left or right.
        """
        # If the x-coordinate is greater than 5/6 of the image width,
        # then we align the text to the right of the box. This is
        # chosen by heuristics.
        if box_coordinate[0] > (self.output.width * 5) // 6:
            return box_coordinate[2], "right"

        return box_coordinate[0], "left"
    def _align_y_top(
        self, box_coordinate: torch.Tensor, num_text: int, textbox_width: float
    ) -> int:
        """
        Calculate the number of text labels to plot on top of the box
        without going out of frames.
        Args:
            box_coordinate (array-like): shape (4,). The (x_left, y_top, x_right, y_bottom)
                coordinates of the box.
            num_text (int): the number of text labels to plot.
            textbox_width (float): the width of the box wrapped around text label.
        """
        dist_to_top = box_coordinate[1]
        num_text_top = dist_to_top // textbox_width

        if isinstance(num_text_top, torch.Tensor):
            num_text_top = int(num_text_top.item())

        return min(num_text, num_text_top)
    def _align_y_bottom(
        self, box_coordinate: torch.Tensor, num_text: int, textbox_width: float
    ) -> int:
        """
        Calculate the number of text labels to plot at the bottom of the box
        without going out of frames.
        Args:
            box_coordinate (array-like): shape (4,). The (x_left, y_top, x_right, y_bottom)
                coordinates of the box.
            num_text (int): the number of text labels to plot.
            textbox_width (float): the width of the box wrapped around text label.
        """
        dist_to_bottom = self.output.height - box_coordinate[3]
        num_text_bottom = dist_to_bottom // textbox_width

        if isinstance(num_text_bottom, torch.Tensor):
            num_text_bottom = int(num_text_bottom.item())

        return min(num_text, num_text_bottom)

class VideoVisualizer:
    def __init__(
        self,
        num_classes: int,
        class_names: Dict,
        top_k: int = 1,
        colormap: str = "rainbow",
        thres: float = 0.7,
        lower_thres: float = 0.3,
        common_class_names: Optional[List[str]] = None,
        mode: str = "top-k",
    ) -> None:
        """
        Args:
            num_classes (int): total number of classes.
            class_names (dict): Dict mapping classID to name.
            top_k (int): number of top predicted classes to plot.
            colormap (str): the colormap to choose color for class labels from.
                See https://matplotlib.org/tutorials/colors/colormaps.html
            thres (float): threshold for picking predicted classes to visualize.
            lower_thres (Optional[float]): If `common_class_names` is given,
                this `lower_thres` will be applied to uncommon classes and
                `thres` will be applied to classes in `common_class_names`.
            common_class_names (Optional[list of str]): list of common class names
                to apply `thres`. Class names not included in `common_class_names` will
                have `lower_thres` as a threshold. If None, all classes will have
                `thres` as a threshold. This is helpful for models trained on a
                highly imbalanced dataset.
            mode (str): Supported modes are {"top-k", "thres"}.
                This is used for choosing predictions for visualization.
        """
        assert mode in ["top-k", "thres"], "Mode {} is not supported.".format(mode)
        self.mode = mode
        self.num_classes = num_classes
        self.class_names = class_names
        self.top_k = top_k
        self.thres = thres
        self.lower_thres = lower_thres

        if mode == "thres":
            self._get_thres_array(common_class_names=common_class_names)

        self.color_map = plt.get_cmap(colormap)
    def _get_color(self, class_id: int) -> List[float]:
        """
        Get color for a class id.
        Args:
            class_id (int): class id.
        """
        return self.color_map(class_id / self.num_classes)[:3]
    def draw_one_frame(
        self,
        frame: Union[torch.Tensor, np.ndarray],
        preds: Union[torch.Tensor, List[float]],
        bboxes: Optional[torch.Tensor] = None,
        alpha: float = 0.5,
        text_alpha: float = 0.7,
        ground_truth: bool = False,
    ) -> np.ndarray:
        """
        Draw labels and bounding boxes for one image. By default, predicted
        labels are drawn in the top left corner of the image or corresponding
        bounding boxes. For ground truth labels (setting True for ground_truth flag),
        labels will be drawn in the bottom left corner.
        Args:
            frame (array-like): a tensor or numpy array of shape (H, W, C),
                where H and W correspond to
                the height and width of the image respectively. C is the number of
                color channels. The image is required to be in RGB format since that
                is a requirement of the Matplotlib library. The image is also expected
                to be in the range [0, 255].
            preds (tensor or list): If ground_truth is False, provide a float tensor of
                shape (num_boxes, num_classes) that contains all of the confidence
                scores of the model. For recognition task, input shape can be (num_classes,).
                To plot true labels (ground_truth is True), preds is a list containing
                int32 class ids of shape (num_boxes, true_class_ids) or (true_class_ids,).
            bboxes (Optional[tensor]): shape (num_boxes, 4) that contains the coordinates
                of the bounding boxes.
            alpha (Optional[float]): transparency level of the bounding boxes.
            text_alpha (Optional[float]): transparency level of the box wrapped around
                text labels.
            ground_truth (bool): whether the provided bounding boxes are ground-truth.
        Returns:
            An image with bounding box annotations and corresponding bbox
            labels plotted on it.
        """
        if isinstance(preds, torch.Tensor):
            if preds.ndim == 1:
                preds = preds.unsqueeze(0)
            n_instances = preds.shape[0]
        elif isinstance(preds, list):
            n_instances = len(preds)
        else:
            # logger.error("Unsupported type of prediction input.")
            return

        if ground_truth:
            top_scores, top_classes = [None] * n_instances, preds

        elif self.mode == "top-k":
            top_scores, top_classes = torch.topk(preds, k=self.top_k)
            top_scores, top_classes = top_scores.tolist(), top_classes.tolist()
        elif self.mode == "thres":
            top_scores, top_classes = [], []
            for pred in preds:
                mask = pred >= self.thres
                top_scores.append(pred[mask].tolist())
                top_class = torch.squeeze(torch.nonzero(mask), dim=-1).tolist()
                top_classes.append(top_class)

        # Create labels for the top-k predicted classes with their scores.
        text_labels = []
        for i in range(n_instances):
            text_labels.append(
                _create_text_labels(
                    top_classes[i],
                    top_scores[i],
                    self.class_names,
                    ground_truth=ground_truth,
                )
            )
        frame_visualizer = ImgVisualizer(frame, meta=None)
        font_size = min(max(np.sqrt(frame.shape[0] * frame.shape[1]) // 25, 5), 9)
        top_corner = not ground_truth
        if bboxes is not None:
            assert len(preds) == len(
                bboxes
            ), "Encounter {} predictions and {} bounding boxes".format(
                len(preds), len(bboxes)
            )
            for i, box in enumerate(bboxes):
                text = text_labels[i]
                pred_class = top_classes[i]
                colors = [self._get_color(pred) for pred in pred_class]

                box_color = "r" if ground_truth else "g"
                line_style = "--" if ground_truth else "-."
                frame_visualizer.draw_box(
                    box,
                    alpha=alpha,
                    edge_color=box_color,
                    line_style=line_style,
                )
                frame_visualizer.draw_multiple_text(
                    text,
                    box,
                    top_corner=top_corner,
                    font_size=font_size,
                    box_facecolors=colors,
                    alpha=text_alpha,
                )
        else:
            text = text_labels[0]
            pred_class = top_classes[0]
            colors = [self._get_color(pred) for pred in pred_class]
            frame_visualizer.draw_multiple_text(
                text,
                torch.Tensor([0, 5, frame.shape[1], frame.shape[0] - 5]),
                top_corner=top_corner,
                font_size=font_size,
                box_facecolors=colors,
                alpha=text_alpha,
            )

        return frame_visualizer.output.get_image()
    def draw_clip_range(
        self,
        frames: Union[torch.Tensor, np.ndarray],
        preds: Union[torch.Tensor, List[float]],
        bboxes: Optional[torch.Tensor] = None,
        text_alpha: float = 0.5,
        ground_truth: bool = False,
        keyframe_idx: Optional[int] = None,
        draw_range: Optional[List[int]] = None,
        repeat_frame: int = 1,
    ) -> List[np.ndarray]:
        """
        Draw predicted labels or ground truth classes to clip.
        Draw bounding boxes to clip if bboxes is provided. Boxes will gradually
        fade in and out the clip, centered around the clip's central frame,
        within the provided `draw_range`.
        Args:
            frames (array-like): video data in the shape (T, H, W, C).
            preds (tensor): a tensor of shape (num_boxes, num_classes) that
                contains all of the confidence scores of the model. For recognition
                task or for ground_truth labels, input shape can be (num_classes,).
            bboxes (Optional[tensor]): shape (num_boxes, 4) that contains the coordinates
                of the bounding boxes.
            text_alpha (float): transparency level of the box wrapped around text labels.
            ground_truth (bool): whether the provided bounding boxes are ground-truth.
            keyframe_idx (int): the index of the keyframe in the clip.
            draw_range (Optional[list[int]]): only draw frames in the range
                [start_idx, end_idx] inclusively in the clip. If None, draw on
                the entire clip.
            repeat_frame (int): repeat each frame in draw_range `repeat_frame`
                times for a slow-motion effect.
        Returns:
            A list of frames with bounding box annotations and corresponding
            bbox labels plotted on them.
        """
        if draw_range is None:
            draw_range = [0, len(frames) - 1]
        draw_range[0] = max(0, draw_range[0])
        left_frames = frames[: draw_range[0]]
        right_frames = frames[draw_range[1] + 1 :]

        draw_frames = frames[draw_range[0] : draw_range[1] + 1]
        if keyframe_idx is None:
            keyframe_idx = len(frames) // 2

        img_ls = (
            list(left_frames)
            + self.draw_clip(
                draw_frames,
                preds,
                bboxes=bboxes,
                text_alpha=text_alpha,
                ground_truth=ground_truth,
                keyframe_idx=keyframe_idx - draw_range[0],
                repeat_frame=repeat_frame,
            )
            + list(right_frames)
        )

        return img_ls
    def draw_clip(
        self,
        frames: Union[torch.Tensor, np.ndarray],
        preds: Union[torch.Tensor, List[float]],
        bboxes: Optional[torch.Tensor] = None,
        text_alpha: float = 0.5,
        ground_truth: bool = False,
        keyframe_idx: Optional[int] = None,
        repeat_frame: int = 1,
    ) -> List[np.ndarray]:
        """
        Draw predicted labels or ground truth classes to clip. Draw bounding boxes
        to clip if bboxes is provided. Boxes will gradually fade in and out the clip,
        centered around the clip's central frame.
        Args:
            frames (array-like): video data in the shape (T, H, W, C).
            preds (tensor): a tensor of shape (num_boxes, num_classes) that contains
                all of the confidence scores of the model. For recognition task or for
                ground_truth labels, input shape can be (num_classes,).
            bboxes (Optional[tensor]): shape (num_boxes, 4) that contains the coordinates
                of the bounding boxes.
            text_alpha (float): transparency level of the box wrapped around text labels.
            ground_truth (bool): whether the provided bounding boxes are ground-truth.
            keyframe_idx (int): the index of the keyframe in the clip.
            repeat_frame (int): repeat each frame in draw_range `repeat_frame`
                times for a slow-motion effect.
        Returns:
            A list of frames with bounding box annotations and corresponding
            bbox labels plotted on them.
        """
        assert repeat_frame >= 1, "`repeat_frame` must be a positive integer."

        repeated_seq = range(0, len(frames))
        repeated_seq = list(
            itertools.chain.from_iterable(
                itertools.repeat(x, repeat_frame) for x in repeated_seq
            )
        )

        frames, adjusted = self._adjust_frames_type(frames)
        if keyframe_idx is None:
            half_left = len(repeated_seq) // 2
            half_right = (len(repeated_seq) + 1) // 2
        else:
            mid = int((keyframe_idx / len(frames)) * len(repeated_seq))
            half_left = mid
            half_right = len(repeated_seq) - mid

        # Fade the box alpha in towards the keyframe and back out after it.
        alpha_ls = np.concatenate(
            [
                np.linspace(0, 1, num=half_left),
                np.linspace(1, 0, num=half_right),
            ]
        )
        frames = frames[repeated_seq]
        img_ls = []
        for alpha, frame in zip(alpha_ls, frames):
            draw_img = self.draw_one_frame(
                frame,
                preds,
                bboxes,
                alpha=alpha,
                text_alpha=text_alpha,
                ground_truth=ground_truth,
            )
            if adjusted:
                draw_img = draw_img.astype("float32") / 255

            img_ls.append(draw_img)

        return img_ls
    def _adjust_frames_type(
        self, frames: torch.Tensor
    ) -> Tuple[List[np.ndarray], bool]:
        """
        Modify video data to have dtype of uint8 and values in the range [0, 255].
        Args:
            frames (array-like): 4D array of shape (T, H, W, C).
        Returns:
            frames (ndarray): frames as uint8 values in the range [0, 255].
            adjusted (bool): whether the original frames needed adjustment.
        """
        assert (
            frames is not None and len(frames) != 0
        ), "Frames do not contain any values"
        frames = np.array(frames)
        assert frames.ndim == 4, "Frames must have 4 dimensions"
        adjusted = False
        if frames.dtype in [np.float32, np.float64]:
            frames *= 255
            frames = frames.astype(np.uint8)
            adjusted = True

        return frames, adjusted
    def _get_thres_array(self, common_class_names: Optional[List[str]] = None) -> None:
        """
        Compute a thresholds array for all classes based on `self.thres` and `self.lower_thres`.
        Args:
            common_class_names (Optional[list of str]): a list of common class names.
        """
        common_class_ids = []
        if common_class_names is not None:
            common_classes = set(common_class_names)

            for key, name in self.class_names.items():
                if name in common_classes:
                    common_class_ids.append(key)
        else:
            common_class_ids = list(range(self.num_classes))

        thres_array = np.full(shape=(self.num_classes,), fill_value=self.lower_thres)
        thres_array[common_class_ids] = self.thres
        self.thres = torch.from_numpy(thres_array)
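For orientation, a minimal usage sketch of the class above (not part of the commit): the three-class label map, the random frames, and the random scores are illustrative assumptions, and a real caller would pass model outputs instead.

# Hypothetical usage sketch for VideoVisualizer; names below marked as
# assumptions are illustrative and not part of this repo.
import numpy as np
import torch

# Assumption: a tiny classID -> name map; a real run would build this from a label file.
class_names = {0: "person", 1: "bicycle", 2: "car"}
visualizer = VideoVisualizer(num_classes=3, class_names=class_names, top_k=2, mode="top-k")

frames = np.random.randint(0, 255, size=(8, 224, 224, 3), dtype=np.uint8)  # (T, H, W, C)
preds = torch.rand(3)  # assumed per-class confidence scores for the whole clip
annotated = visualizer.draw_clip_range(frames, preds)  # list of T annotated frames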
yolov3.cfg
ADDED
@@ -0,0 +1,789 @@
[net]
# Testing
# batch=1
# subdivisions=1
# Training
batch=64
subdivisions=16
width=608
height=608
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

# Downsample

[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

######################

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear


[yolo]
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1


[route]
layers = -4

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 61



[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear


[yolo]
mask = 3,4,5
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1



[route]
layers = -4

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 36



[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear


[yolo]
mask = 0,1,2
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
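For orientation, a minimal sketch (not part of the commit) of how a Darknet config like this is typically consumed, here via OpenCV's DNN module; the `yolov3.weights` file is an assumption, since only the config is committed and the weights would have to be downloaded separately.

# Hypothetical sanity check for the added config; yolov3.weights is assumed
# to have been fetched from the official Darknet release, it is not in this repo.
import cv2

net = cv2.dnn.readNetFromDarknet("yolov3.cfg", "yolov3.weights")
layer_names = net.getLayerNames()
# The three [yolo] heads above (masks 6,7,8 / 3,4,5 / 0,1,2) surface as the
# unconnected output layers of the parsed network (1-based indices).
output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers().flatten()]
print(output_layers)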