From 589734cce9848d21511bbc8640f819d79e48247b Mon Sep 17 00:00:00 2001
From: cailean
Date: Fri, 4 Oct 2024 14:36:14 +0100
Subject: [PATCH] before image cropping pre-inference

---
 src/Map.cpp    |   7 ++-
 src/Player.cpp |  26 +++++++--
 src/Player.h   |   3 +-
 src/main.cpp   |   2 +-
 src/ofApp.cpp  | 141 +++++++++++++++++++++++++++++++++----------------
 src/ofApp.h    |  12 ++++-
 6 files changed, 136 insertions(+), 55 deletions(-)

diff --git a/src/Map.cpp b/src/Map.cpp
index d3a59eb..2538f4e 100644
--- a/src/Map.cpp
+++ b/src/Map.cpp
@@ -18,9 +18,9 @@ void Map::Setup(){
         SetupTSNE();
     }
 
-    mapFbo.allocate(ofGetWindowWidth(), ofGetWindowHeight(), GL_RGB);
+    mapFbo.allocate(ofGetWindowWidth() / 2, ofGetWindowHeight(), GL_RGB);
 
-    fboImage.allocate(ofGetWindowWidth(), ofGetWindowHeight(), OF_IMAGE_COLOR);
+    fboImage.allocate(ofGetWindowWidth() / 2, ofGetWindowHeight(), OF_IMAGE_COLOR);
 
     Setup3D();
 
@@ -66,9 +66,8 @@ void Map::Draw(){
     mapFbo.end();
 
     mapFbo.readToPixels(fboPixels);
-    fboImage.setFromPixels(fboPixels);
 
-    //mapFbo.draw(0, 0);
+    fboImage.setFromPixels(fboPixels);
 }
 
 /*
diff --git a/src/Player.cpp b/src/Player.cpp
index 8d1a491..15737ff 100644
--- a/src/Player.cpp
+++ b/src/Player.cpp
@@ -44,9 +44,7 @@ ofPixels Player::GetVideoPixels(){
 
 void Player::SetVideo(std::string path, ofFbo &fbo){
     videoPlayer.load(path);
     videoPlayer.setFrame(800);
-    //fbo.allocate(videoPlayer.getWidth(), videoPlayer.getHeight(), GL_RGB);
-    // Just setting the video dims here for the tsne map!
-    fbo.allocate(1600, 800, GL_RGB);
+    fbo.allocate(videoPlayer.getWidth(), videoPlayer.getHeight(), GL_RGB);
 }
 
@@ -54,4 +52,26 @@ void Player::SetRandomFrame(){
     int randomFrame = ofRandom(0, videoPlayer.getTotalNumFrames());
     std::cout << "setting frame: " << randomFrame << std::endl;
     videoPlayer.setFrame(randomFrame);
+}
+
+void Player::SetVideoPosition(ofFbo& output_fbo){
+    int playerW = videoPlayer.getWidth();
+    int playerH = videoPlayer.getHeight();
+
+    // Calculate the scaling needed to cover the output FBO area
+    float targetWidth = output_fbo.getWidth();
+    float targetHeight = output_fbo.getHeight();
+
+    float scaleX = targetWidth / playerW;
+    float scaleY = targetHeight / playerH;
+
+    // Use the larger scaling factor to ensure coverage
+    float scale = std::max(scaleX, scaleY);
+
+    // Calculate scaled dimensions
+    int scaledWidth = playerW * scale;
+    int scaledHeight = playerH * scale;
+
+    // Center the video within the FBO
+    centerPosition = glm::vec2((targetWidth - scaledWidth) / 2, (targetHeight - scaledHeight) / 2);
 }
\ No newline at end of file
diff --git a/src/Player.h b/src/Player.h
index 98baf26..aa36817 100644
--- a/src/Player.h
+++ b/src/Player.h
@@ -12,7 +12,7 @@ class Player {
        void Draw();
        void SetVideo(std::string path, ofFbo &fbo);
        ofPixels GetVideoPixels();
-       void SetVideoPosition();
+       void SetVideoPosition(ofFbo& output_fbo);
        void SetRandomFrame();
        void SetupGUI();
        void UpdateGUI();
@@ -32,6 +32,7 @@ class Player {
 
        ofFbo fbo;
 
 
+       Player();
 };
 
diff --git a/src/main.cpp b/src/main.cpp
index 7592dfd..3f8924d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -6,7 +6,7 @@ int main( ){
 
    //Use ofGLFWWindowSettings for more options like multi-monitor fullscreen
    ofGLWindowSettings settings;
-   settings.setSize(1600, 800);
+   settings.setSize(1920, 1080);
    settings.setGLVersion(3, 2);
    settings.windowMode = OF_WINDOW; //can also be OF_FULLSCREEN
 
diff --git a/src/ofApp.cpp b/src/ofApp.cpp
index 26084b0..0ad4c1b 100644
--- a/src/ofApp.cpp
+++ b/src/ofApp.cpp
@@ -2,33 +2,47 @@
 //--------------------------------------------------------------
 void ofApp::setup(){
+    /* ofSettings */
     ofDisableArbTex();
-    ofSetFrameRate(60);
-    // ofSetVerticalSync(true);
+    ofSetFrameRate(24);
+    ofSetVerticalSync(true);
+    window_width = ofGetWindowWidth();
+    window_height = ofGetWindowHeight();
 
+    /* load font */
     tf.load("data/fonts/jetbrainsmono-regular.ttf", 20);
 
-    map.Setup();
+    /* load shader */
+    depthToColourShader.load("data/shader/rampShader.vert", "data/shader/rampShader.frag");
 
-    //player.Setup();
-    //player.SetVideo("videos/demo.mp4", fbo);
+    /* setup map */
+    map.Setup();
 
-    emoteImage.allocate(260, 260);
-    tempImage.allocate(emoteImage.getWidth(), emoteImage.getHeight(), OF_IMAGE_COLOR);
+    /* setup video */
+    player.Setup();
+    player.SetVideo("videos/demo.mp4", model_output_fbo_1);
 
+    /* setup models (modelPath, log, useCuda) */
     ORTCHAR_T* modelPath = "/home/cailean/Desktop/openframeworks/of_v0.12.0_linux64gcc6_release/apps/myApps/onnx-test/bin/data/depth_anything_v2_vitb.onnx";
     ORTCHAR_T* modelPath2 = "/home/cailean/Desktop/openframeworks/of_v0.12.0_linux64gcc6_release/apps/myApps/onnx-test/bin/data/yolov5s-face.onnx";
     ORTCHAR_T* modelPath3 = "/home/cailean/Desktop/openframeworks/of_v0.12.0_linux64gcc6_release/apps/myApps/onnx-test/bin/data/rgb_emotion.onnx";
+    ORTCHAR_T* modelPath4 = "/home/cailean/Desktop/openframeworks/of_v0.12.0_linux64gcc6_release/apps/myApps/onnx-test/bin/data/depth_anything_v2_vits.onnx";
 
-    /* Setup Models (modelPath, log, useCuda) */
     yolo.Setup(modelPath2, false, true);
     depth.Setup(modelPath, false, true);
+    depth_small.Setup(modelPath4, false, true);
     emotion.Setup(modelPath3, false, true);
 
-    /* Load shader, allocated rampedFbo */
-    depthToColourShader.load("data/shader/rampShader.vert", "data/shader/rampShader.frag");
-    fbo.allocate(1600, 800, GL_RGB);
-    rampedFbo.allocate(1600, 800);
+    /* Depth output fbo */
+    model_output_fbo.allocate(window_width / 2, window_height, GL_RGB);
+
+    /* Shader output */
+    rampedFbo.allocate(window_width, window_height);
+
+    emoteImage.allocate(260, 260);
+    tempImage.allocate(emoteImage.getWidth(), emoteImage.getHeight(), OF_IMAGE_COLOR);
+
+    screen_fbo.allocate(window_width, window_height, GL_RGB);
 }
 
@@ -37,7 +51,6 @@ void ofApp::setup(){
 
 void ofApp::update(){
     /* Check to see if the application has moved to the first frame
    As the models need to load first, as the first inference is quite slow */
-    auto start = std::chrono::high_resolution_clock::now();
     if(ofGetFrameNum() > 0)
         firstRun = false;
 
@@ -52,35 +65,29 @@ void ofApp::update(){
     }
 
     /* Setup model input using ofImage, allocated fbo */
-    //player.Update(img);
-    //img.setFromPixels(player.GetVideoPixels());
-
+    player.Update(img);
+    img.setFromPixels(player.GetVideoPixels());
+
     /* Run Models */
     try{
-
-        auto output_tensors = depth.Run(map.fboImage);
-        float* output_ptr = output_tensors.front().GetTensorMutableData<float>();
-        size_t num_elements = output_tensors.front().GetTensorTypeAndShapeInfo().GetElementCount();
-
-        float min_value = depth.ReduceMin(output_ptr, num_elements);
-        float max_value = depth.ReduceMax(output_ptr, num_elements);
-
-        depth.Normalize(output_ptr, num_elements, min_value, max_value);
-
-        depth.DataToFbo(output_ptr, 518, 518, fbo);
+        // map
+        inferDepthImage(model_output_fbo, map.fboImage, depth);
 
-        auto output_tensors_face = yolo.Run(map.fboImage);
+        // video player
+        inferDepthImage(model_output_fbo_1, img, depth_small);
+
+        // auto output_tensors_face = yolo.Run(model_input_img);
 
-        auto output_faces = output_tensors_face.front().GetTensorTypeAndShapeInfo().GetShape();
+        // auto output_faces = output_tensors_face.front().GetTensorTypeAndShapeInfo().GetShape();
 
-        unsigned int num_anchors = output_faces[1]; // Number of anchors
+        // unsigned int num_anchors = output_faces[1]; // Number of anchors
 
-        float* output_face_ptr = output_tensors_face.front().GetTensorMutableData<float>();
+        // float* output_face_ptr = output_tensors_face.front().GetTensorMutableData<float>();
 
-        faceDetector.ParseOutput(output_face_ptr, detected_faces, num_anchors);
+        // faceDetector.ParseOutput(output_face_ptr, detected_faces, num_anchors);
 
-        faceDetector.ConvertBoxCoordsToOriginalSize(detected_faces, fbo.getWidth(), fbo.getHeight());
+        // faceDetector.ConvertBoxCoordsToOriginalSize(detected_faces, outFbo.getWidth(), outFbo.getHeight());
 
         /* As no input is generated for the emotion recognition model, run a dummy vector through the model
        So it can load */
@@ -112,30 +119,59 @@ void ofApp::update(){
             std::cout << "Model did not run" << std::endl;
         }
-
-        auto end = std::chrono::high_resolution_clock::now();
-        std::chrono::duration<double> duration = end - start;
-        std::cout << "Time taken for Update: " << duration.count() << " seconds" << std::endl;
-
 }
 
 //--------------------------------------------------------------
 void ofApp::draw(){
     map.Draw();
+
+    screen_fbo.begin();
+
+    // Calculate the target width and height for model_output_fbo_1
+    float fbo_1_target_width = window_width * 0.5; // 1/2 of the screen width (960px)
+    float fbo_1_target_height = window_height; // Full height of the screen
+
+    // Calculate the aspect ratio of the video and the FBO
+    float video_aspect_ratio = model_output_fbo_1.getWidth() / model_output_fbo_1.getHeight();
+    float fbo_aspect_ratio = fbo_1_target_width / fbo_1_target_height;
+
+    // Adjust the scaling to cover the FBO area while maintaining aspect ratio
+    float new_width, new_height;
+    if (fbo_aspect_ratio > video_aspect_ratio) {
+        // FBO is wider; scale by width to fill the FBO
+        new_width = fbo_1_target_width;
+        new_height = new_width / video_aspect_ratio; // Scale height to maintain aspect ratio
+    } else {
+        // FBO is taller; scale by height to fill the FBO
+        new_height = fbo_1_target_height;
+        new_width = new_height * video_aspect_ratio; // Scale width to maintain aspect ratio
+    }
+
+    // Center the video to ensure it fills the FBO and is cropped if necessary
+    float x_pos = (window_width * 0.75) - (new_width / 2);
+    float y_pos = (window_height - new_height) / 2; // Center vertically
+
+    // Draw the scaled video inside the FBO
+    model_output_fbo_1.draw(x_pos, y_pos, new_width, new_height);
+
+    model_output_fbo.draw(0, 0);
+
+    screen_fbo.end();
 
     renderDepthMap();
 
-    if(!firstRun && detected_faces.size() != 0){
-        faceDetector.DrawBox(detected_faces);
-        faceDetector.DrawCenter(detected_faces);
-    }
+    // if(!firstRun && detected_faces.size() != 0){
+    //     faceDetector.DrawBox(detected_faces);
+    //     faceDetector.DrawCenter(detected_faces);
+    // }
 
     ofPushMatrix();
     ofSetColor(255);
     ofSetBackgroundColor(0);
     tf.drawString(std::to_string(ofGetFrameRate()), 10, 30);
     ofPopMatrix();
+
     // emoteImage.draw(640, 0);
     // for(auto& face : detected_faces){
     //     ofDrawBitmapString(std::to_string(face.box.emotional_state.emotions[0]), 700, 300);
     // }
 }
 
+void ofApp::inferDepthImage(ofFbo& fbo, ofImage& img, Onnx& model){
+    auto output_tensors = model.Run(img);
+    float* output_ptr = output_tensors.front().GetTensorMutableData<float>();
+    size_t num_elements = output_tensors.front().GetTensorTypeAndShapeInfo().GetElementCount();
+
+    float min_value = model.ReduceMin(output_ptr, num_elements);
+    float max_value = model.ReduceMax(output_ptr, num_elements);
+
+    model.Normalize(output_ptr, num_elements, min_value, max_value);
+
+    model.DataToFbo(output_ptr, 518, 518, fbo);
+}
+
 //--------------------------------------------------------------
 void ofApp::inferEmotionalState(){
 
@@ -184,15 +233,17 @@ void ofApp::inferEmotionalState(){
     }
 }
 
+/*
+    Depth Map Shader Pass
+*/
 void ofApp::renderDepthMap(){
     rampedFbo.begin();
     depthToColourShader.begin();
-    depthToColourShader.setUniformTexture("tex0", fbo.getTexture(), 0);
-    depthToColourShader.setUniformTexture("tex1", map.fboImage.getTexture(), 1);
+    depthToColourShader.setUniformTexture("tex0", screen_fbo.getTexture(), 0);
     depthToColourShader.setUniform1f("texW", rampedFbo.getWidth());
     depthToColourShader.setUniform1f("texH", rampedFbo.getHeight());
 
-    fbo.draw(0, 0);
+    screen_fbo.draw(0, 0);
     depthToColourShader.end();
     rampedFbo.end();
 
diff --git a/src/ofApp.h b/src/ofApp.h
index df45979..037e908 100644
--- a/src/ofApp.h
+++ b/src/ofApp.h
@@ -31,15 +31,20 @@ class ofApp : public ofBaseApp{
        void gotMessage(ofMessage msg);
        void inferEmotionalState();
        void renderDepthMap();
+       void inferDepthImage(ofFbo& fbo, ofImage& img, Onnx& model);
+
+       float window_height;
+       float window_width;
 
        ofImage img;
-       ofFbo fbo;
+       cv::Mat cvImg;
        ofVideoGrabber webcam;
 
        Player player;
        bool firstRun = true;
 
        Onnx depth;
+       Onnx depth_small;
        Onnx yolo;
        Onnx emotion;
        ofxCvColorImage emoteImage;
@@ -57,4 +62,9 @@ class ofApp : public ofBaseApp{
 
        ofFbo rampedFbo;
        ofTrueTypeFont tf;
+
+       ofFbo video_player_fbo;
+       ofFbo model_output_fbo;
+       ofFbo model_output_fbo_1;
+       ofFbo screen_fbo;
 };
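
The block added to ofApp::draw() above (and the related Player::SetVideoPosition) implements an aspect-fill, or "cover", placement: the video FBO is scaled until it fills its target region, its aspect ratio is preserved, and it is centred so any overflow is cropped. For reference only, and not part of the patch, the same calculation can be written as a standalone helper; the function name coverFitRect is an illustrative assumption.

    #include "ofMain.h"

    // Illustrative sketch (not in the repo): scale a srcW x srcH image so it
    // covers the target area while keeping its aspect ratio, centred so any
    // overflow extends equally past the target edges and is cropped.
    ofRectangle coverFitRect(float srcW, float srcH,
                             float targetX, float targetY,
                             float targetW, float targetH){
        float srcAspect    = srcW / srcH;
        float targetAspect = targetW / targetH;

        float newW, newH;
        if (targetAspect > srcAspect) {
            // Target is relatively wider: match widths, let the height overflow.
            newW = targetW;
            newH = newW / srcAspect;
        } else {
            // Target is relatively taller: match heights, let the width overflow.
            newH = targetH;
            newW = newH * srcAspect;
        }

        // Centre within the target area.
        float x = targetX + (targetW - newW) / 2.0f;
        float y = targetY + (targetH - newH) / 2.0f;
        return ofRectangle(x, y, newW, newH);
    }

With the 1920x1080 window set in main.cpp, calling this with the right half of the screen as the target (targetX = window_width / 2, targetW = window_width / 2) gives the same values as the inline maths in draw(); the x_pos there reduces to window_width * 0.75 - new_width / 2.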
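
inferDepthImage() runs a depth model on an image, min-max normalises the raw tensor output through the Onnx wrapper's ReduceMin, ReduceMax and Normalize calls, and finally uploads the result to an FBO with DataToFbo. A minimal sketch of that normalisation step, written against the standard library rather than the wrapper (the function name normalizeDepth is an assumption for illustration):

    #include <algorithm>
    #include <cstddef>

    // Map raw depth values to [0, 1] in place, mirroring what the
    // ReduceMin / ReduceMax / Normalize sequence does before DataToFbo
    // converts the buffer into grayscale pixels.
    void normalizeDepth(float* data, std::size_t n){
        if (n == 0) return;
        float minV = *std::min_element(data, data + n);
        float maxV = *std::max_element(data, data + n);
        float range = (maxV - minV) > 0.0f ? (maxV - minV) : 1.0f; // guard against flat output
        for (std::size_t i = 0; i < n; ++i)
            data[i] = (data[i] - minV) / range;
    }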