WeixuanZ · DaveDuck321 · Feb 7, 2022 · Feb 7, 2022 · Feb 8, 2022 · Feb 8, 2022
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1 @@
+src/__pycache__
diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,7 @@ cython_debug/
 uploads/
 *.dat
 *.pickle
+#   OpenCV model files
+haarcascade_frontalface_default.xml
+*.pb
+*.pbtxt
diff --git a/Dockerfile b/Dockerfile
@@ -1,18 +1,23 @@
-FROM ubuntu:20.04
-LABEL version="0.1"
-LABEL description="Docker image for YACK"
-ARG DEBIAN_FRONTEND=noninteractive
+FROM python:3
+LABEL version="0.2"
+LABEL description="Docker image for yack!"
 
-RUN apt update
-RUN apt install -y python3-pip python3-dev cmake ffmpeg libsm6 libxext6 wget
-
-COPY ./requirements.txt /app/requirements.txt
+RUN apt-get update && apt-get install -y \
+    ffmpeg \
+    libsm6 \
+    libxext6 \
+    && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
+RUN mkdir /app/uploads
 
+COPY requirements.txt /app/requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
-COPY . /app
+ADD https://github.com/spmallick/learnopencv/raw/master/AgeGender/opencv_face_detector_uint8.pb                     /app/opencv_model/
+ADD https://raw.githubusercontent.com/spmallick/learnopencv/master/AgeGender/opencv_face_detector.pbtxt             /app/opencv_model/
+ADD https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml    /app/opencv_model/
+COPY src/ /app/src/
 
 EXPOSE 8000
 CMD ["gunicorn", "--bind", ":8000", "--workers", "2", "--threads", "8", "--timeout", "0", "--pythonpath", "./src", "main:app"]
diff --git a/Makefile b/Makefile
@@ -8,17 +8,21 @@ install: download-model
 	pre-commit install
 
 download-model:
-	if [ ! -f "$(DIR)/src/dlib_shape_predictor/shape_predictor_68_face_landmarks.dat" ]; then \
-		wget http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2; \
-		bzip2 -d shape_predictor_68_face_landmarks.dat.bz2; \
-		mv shape_predictor_68_face_landmarks.dat $(DIR)/src/dlib_shape_predictor/; \
+	if [ ! -f "$(DIR)/opencv_model/opencv_face_detector_uint8.pb" ]; then \
+		wget https://github.com/spmallick/learnopencv/raw/master/AgeGender/opencv_face_detector_uint8.pb -P "$(DIR)/opencv_model/"; \
+	fi;
+	if [ ! -f "$(DIR)/opencv_model/opencv_face_detector.pbtxt" ]; then \
+		wget https://raw.githubusercontent.com/spmallick/learnopencv/master/AgeGender/opencv_face_detector.pbtxt -P "$(DIR)/opencv_model/"; \
+	fi;
+	if [ ! -f "$(DIR)/opencv_model/haarcascade_frontalface_default.xml" ]; then \
+		wget https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml -P "$(DIR)/opencv_model/"; \
 	fi;
 
 build:
 	docker build -t yack:latest .
 
-run: download-model build
-	docker run -e "DEEPGRAM_API_KEY=$(DEEPGRAM_API_KEY)" -e "ENV=production" -p 8000:8000 yack
+run: build
+	docker run --rm -e "DEEPGRAM_API_KEY=$(DEEPGRAM_API_KEY)" -e "ENV=development" -p 8000:8000 yack
 
 push: build
 	docker tag yack:latest $(CONTAINER_REGISTRY)/yack:latest

diff --git a/README.md b/README.md
@@ -6,11 +6,6 @@ Deepgram api is used for speech-to-text, get your key at https://deepgram.com an
 DEEPGRAM_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 ```
 
-dlib Facial Landmark Detector is used, which is available under the Boost Software License
-from https://github.com/davisking/dlib. The pretrained weights used are available
-from http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2 and should be placed
-in `./src/dlib_shape_predictor/`.
-
 To start developing using Docker, simply use
 ```shell
 make run

diff --git a/src/dlib_shape_predictor/.gitfolder → opencv_model/.gitfolder b/src/dlib_shape_predictor/.gitfolder → opencv_model/.gitfolder
diff --git a/requirements.txt b/requirements.txt
@@ -2,39 +2,37 @@ aiohttp==3.8.1
 aiosignal==1.2.0
 async-timeout==4.0.2
 attrs==21.4.0
-black==21.12b0
+black==22.1.0
 cairocffi==1.3.0
 CairoSVG==2.5.2
 cffi==1.15.0
 cfgv==3.3.1
-charset-normalizer==2.0.10
+charset-normalizer==2.0.11
 click==8.0.3
 cssselect2==0.4.1
-deepgram-sdk==0.2.1
+deepgram-sdk==0.2.4
 defusedxml==0.7.1
 distlib==0.3.4
-dlib==19.22.1
 drawSvg==1.8.3
 ffmpeg-python==0.2.0
 filelock==3.4.2
 Flask==2.0.2
 frozenlist==1.3.0
 future==0.18.2
 gunicorn==20.1.0
-identify==2.4.4
+identify==2.4.8
 idna==3.3
-imageio==2.14.0
-imutils==0.5.4
+imageio==2.15.0
 itsdangerous==2.0.1
 Jinja2==3.0.3
 MarkupSafe==2.0.1
-multidict==5.2.0
+multidict==6.0.2
 mypy-extensions==0.4.3
 nodeenv==1.6.0
-numpy==1.22.1
+numpy==1.22.2
 opencv-python==4.5.5.62
 pathspec==0.9.0
-Pillow==9.0.0
+Pillow==9.0.1
 platformdirs==2.4.1
 pre-commit==2.17.0
 pycparser==2.21
@@ -43,9 +41,8 @@ PyYAML==6.0
 six==1.16.0
 tinycss2==1.1.1
 toml==0.10.2
-tomli==1.2.3
-typing_extensions==4.0.1
-virtualenv==20.13.0
+tomli==2.0.0
+virtualenv==20.13.1
 webencodings==0.5.1
 websockets==10.1
 Werkzeug==2.0.2

diff --git a/src/face_detector.py b/src/face_detector.py
@@ -1,83 +1,89 @@
 from pathlib import Path
 
+import numpy as np
 import cv2
-import dlib
-from imutils import face_utils
 
 from structures import Rect
 
 
 class FaceDetector:
-    def __init__(self):
-        self.DETECTOR = dlib.get_frontal_face_detector()
-        # self.PREDICTOR = dlib.shape_predictor(
-        #     (
-        #         Path(".")
-        #         / "src"
-        #         / "dlib_shape_predictor"
-        #         / "shape_predictor_68_face_landmarks.dat"
-        #     ).as_posix()
-        # )
-
-    @staticmethod
-    def dist(a, b):
-        return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5
+    def find_speaker_face(self, frame: np.ndarray) -> Rect:
+        raise NotImplementedError()
+
+
+class FaceDetectorDNN(FaceDetector):
+    def __init__(
+        self,
+        model_path=Path("opencv_model", "opencv_face_detector_uint8.pb"),
+        config_path=Path("opencv_model", "opencv_face_detector.pbtxt"),
+        detection_threshold=0.5,
+    ):
+        self.model = cv2.dnn.readNetFromTensorflow(str(model_path), str(config_path))
+        self.detection_threshold = detection_threshold
 
     def find_speaker_face(self, frame):
-        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-        rects = self.DETECTOR(gray, 1)
-
-        # this is the default speaker face position
-        speaker_face = Rect(
-            ((frame.shape[0] // 2) - 10),
-            ((frame.shape[1] // 2) - 10),
-            ((frame.shape[0] // 2) + 10),
-            ((frame.shape[1] // 2) + 10),
+        blob = cv2.dnn.blobFromImage(
+            frame, 1.0, (300, 300), [104, 117, 123], False, False
         )
 
-        speaker_mouth_ratio = 0.0
+        self.model.setInput(blob)
+        possible_face_detections = self.model.forward()
 
         # find the minimum bounding box that contains all speakers
-        min_x = frame.shape[0]
-        max_x = 0
-        min_y = frame.shape[1]
-        max_y = 0
+        min_x, min_y = frame.shape[1], frame.shape[0]
+        max_x, max_y = 0, 0
+
+        for i in range(possible_face_detections.shape[2]):
+            face = possible_face_detections[0, 0, i]
+            if face[2] > self.detection_threshold:
+                x1 = int(face[3] * frame.shape[1])
+                y1 = int(face[4] * frame.shape[0])
+                x2 = int(face[5] * frame.shape[1])
+                y2 = int(face[6] * frame.shape[0])
+
+                min_x, min_y = min(min_x, x1), min(min_y, y1)
+                max_x, max_y = max(max_x, x2), max(max_y, y2)
+
+        if min_x > max_x or min_y > max_y:
+            # Can't find a face, default to the whole image
+            min_x, min_y = 0, 0
+            max_x, max_y = frame.shape[1], frame.shape[0]
 
-        for rect in rects:
-            # shape = self.PREDICTOR(gray, rect)
-            # shape = face_utils.shape_to_np(shape)
+        speakers_bb = Rect(
+            min_x,
+            min_y,
+            (max_x - min_x),
+            (max_y - min_y),
+        )
 
-            # mouth_open = max(
-            #     FaceDetector.dist(shape[61], shape[67]),
-            #     FaceDetector.dist(shape[62], shape[66]),
-            #     FaceDetector.dist(shape[63], shape[65]),
-            # )
-            # mouth_width = FaceDetector.dist(shape[54], shape[48])
+        return speakers_bb
 
-            (x, y, w, h) = face_utils.rect_to_bb(rect)
 
+class FaceDetectorCascade(FaceDetector):
+    def __init__(
+        self, model_path=Path("opencv_model", "haarcascade_frontalface_default.xml")
+    ):
+        self.model = cv2.CascadeClassifier(str(model_path))
+
+    def find_speaker_face(self, frame):
+        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        gray = cv2.equalizeHist(gray)
+
+        face_rects = self.model.detectMultiScale(gray, 1.1, 6)
+
+        # find the minimum bounding box that contains all speakers
+        min_x, min_y = frame.shape[1], frame.shape[0]
+        max_x, max_y = 0, 0
+
+        for (x, y, w, h) in face_rects:
             # extend text exclusion bounding box to include speaker
-            if x < min_x:
-                min_x = x
-            if x + w > max_x:
-                max_x = x + w
-            if y < min_y:
-                min_y = y
-            if y + h > max_y:
-                max_y = y + h
-
-            # if (mouth_open / mouth_width) > speaker_mouth_ratio:
-            #     speaker_mouth_ratio = mouth_open / mouth_width
-            #     speaker_face = Rect(
-            #         x,
-            #         y,
-            #         w,
-            #         h,
-            #     )
-
-        if min_x > max_x:
-            max_x = min_x
-            max_y = min_y
+            min_x, min_y = min(min_x, x), min(min_y, y)
+            max_x, max_y = max(max_x, x + w), max(max_y, y + h)
+
+        if min_x > max_x or min_y > max_y:
+            # Can't find a face, default to the whole image
+            min_x, min_y = 0, 0
+            max_x, max_y = frame.shape[1], frame.shape[0]
 
         speakers_bb = Rect(
             min_x,
@@ -86,21 +92,33 @@ def find_speaker_face(self, frame):
             (max_y - min_y),
         )
 
-        return speaker_face, speakers_bb
+        return speakers_bb
 
 
 if __name__ == "__main__":
     cap = cv2.VideoCapture(0)
-    face_detector = FaceDetector()
+    face_detector = FaceDetectorDNN()
 
     if not cap.isOpened():
         raise IOError("Cannot open webcam")
 
     while True:
         ret, frame = cap.read()
-        frame = cv2.resize(frame, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
-        cv2.imshow("Input", frame)
-        speaker_face, speakers_bb = face_detector.find_speaker_face(frame)
+
+        speakers_bb = face_detector.find_speaker_face(frame)
+
+        cv2.rectangle(
+            frame,
+            (speakers_bb.x, speakers_bb.y),
+            (
+                speakers_bb.x + speakers_bb.width,
+                speakers_bb.y + speakers_bb.height,
+            ),
+            (255, 0, 0),
+            2,
+        )
+
+        cv2.imshow("Face", frame)
 
         if cv2.waitKey(1) == ord("q"):
             break

diff --git a/src/main.py b/src/main.py
@@ -17,7 +17,7 @@
     url_for,
 )
 
-from face_detector import FaceDetector
+from face_detector import FaceDetector, FaceDetectorDNN
 from frame_processor import StyleTransfer
 
 from layout_generator import LayoutGenerator
@@ -30,7 +30,7 @@
 app = Flask(__name__)
 app.config["UPLOAD_FOLDER"] = (Path(".") / "uploads").resolve()
 app.config["MAX_CONTENT_LENGTH"] = 16 * 1000 * 1000  # Limit uploads to 16 MB.
-app.config["PREFERRED_URL_SCHEME"] = "https"
+app.config["PREFERRED_URL_SCHEME"] = "https" if PRODUCTION else "http"
 
 
 def pipe(
@@ -62,10 +62,7 @@ def get_key_frame_index(segment: Segment) -> None:
 def detect_speaker(face_detector: FaceDetector):
     def face_detector_func(segment: Segment) -> None:
         segment.keyframe = segment.frames[segment.keyframe_index]
-        (
-            segment.speaker_location,
-            segment.speakers_bbox,
-        ) = face_detector.find_speaker_face(segment.keyframe)
+        segment.speakers_bbox = face_detector.find_speaker_face(segment.keyframe)
 
     return face_detector_func
 
@@ -145,7 +142,7 @@ def process_video(path: str) -> str:
         with open("transcript.json", "w") as file:
             json.dump(utterances, file, indent=4)
 
-    face_detector = FaceDetector()
+    face_detector = FaceDetectorDNN()
     pipeline = pipe(
         attach_frames(video),
         get_key_frame_index,

diff --git a/src/structures.py b/src/structures.py
@@ -47,7 +47,6 @@ def __init__(
         frames: np.ndarray = None,
         keyframe_index: int = None,
         keyframe: np.ndarray = None,
-        speaker_location: Rect = None,
         speakers_bbox: Rect = None,
         image: ImageData = None,
     ):
@@ -59,6 +58,5 @@ def __init__(
         self.frames = frames
         self.keyframe_index = keyframe_index
         self.keyframe = keyframe
-        self.speaker_location = speaker_location
         self.speakers_bbox = speakers_bbox
         self.image = image