diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..550d67d --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +src/__pycache__ diff --git a/.gitignore b/.gitignore index e4b71a8..5426bfe 100644 --- a/.gitignore +++ b/.gitignore @@ -132,3 +132,7 @@ cython_debug/ uploads/ *.dat *.pickle +# OpenCV model files +haarcascade_frontalface_default.xml +*.pb +*.pbtxt diff --git a/Dockerfile b/Dockerfile index 4181a24..90bf9db 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,23 @@ -FROM ubuntu:20.04 -LABEL version="0.1" -LABEL description="Docker image for YACK" -ARG DEBIAN_FRONTEND=noninteractive +FROM python:3 +LABEL version="0.2" +LABEL description="Docker image for yack!" -RUN apt update -RUN apt install -y python3-pip python3-dev cmake ffmpeg libsm6 libxext6 wget - -COPY ./requirements.txt /app/requirements.txt +RUN apt-get update && apt-get install -y \ + ffmpeg \ + libsm6 \ + libxext6 \ + && rm -rf /var/lib/apt/lists/* WORKDIR /app +RUN mkdir /app/uploads +COPY requirements.txt /app/requirements.txt RUN pip install --no-cache-dir -r requirements.txt -COPY . /app +ADD https://github.com/spmallick/learnopencv/raw/master/AgeGender/opencv_face_detector_uint8.pb /app/opencv_model/ +ADD https://raw.githubusercontent.com/spmallick/learnopencv/master/AgeGender/opencv_face_detector.pbtxt /app/opencv_model/ +ADD https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml /app/opencv_model/ +COPY src/ /app/src/ EXPOSE 8000 CMD ["gunicorn", "--bind", ":8000", "--workers", "2", "--threads", "8", "--timeout", "0", "--pythonpath", "./src", "main:app"] diff --git a/Makefile b/Makefile index daa1c93..a271bfe 100644 --- a/Makefile +++ b/Makefile @@ -8,17 +8,21 @@ install: download-model pre-commit install download-model: - if [ ! 
-f "$(DIR)/src/dlib_shape_predictor/shape_predictor_68_face_landmarks.dat" ]; then \ - wget http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2; \ - bzip2 -d shape_predictor_68_face_landmarks.dat.bz2; \ - mv shape_predictor_68_face_landmarks.dat $(DIR)/src/dlib_shape_predictor/; \ + if [ ! -f "$(DIR)/opencv_model/opencv_face_detector_uint8.pb" ]; then \ + wget https://github.com/spmallick/learnopencv/raw/master/AgeGender/opencv_face_detector_uint8.pb -P "$(DIR)/opencv_model/"; \ + fi; + if [ ! -f "$(DIR)/opencv_model/opencv_face_detector.pbtxt" ]; then \ + wget https://raw.githubusercontent.com/spmallick/learnopencv/master/AgeGender/opencv_face_detector.pbtxt -P "$(DIR)/opencv_model/"; \ + fi; + if [ ! -f "$(DIR)/opencv_model/haarcascade_frontalface_default.xml" ]; then \ + wget https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml -P "$(DIR)/opencv_model/"; \ fi; build: docker build -t yack:latest . -run: download-model build - docker run -e "DEEPGRAM_API_KEY=$(DEEPGRAM_API_KEY)" -e "ENV=production" -p 8000:8000 yack +run: build + docker run --rm -e "DEEPGRAM_API_KEY=$(DEEPGRAM_API_KEY)" -e "ENV=development" -p 8000:8000 yack push: build docker tag yack:latest $(CONTAINER_REGISTRY)/yack:latest diff --git a/README.md b/README.md index 83d1795..f93215d 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,6 @@ Deepgram api is used for speech-to-text, get your key at https://deepgram.com an DEEPGRAM_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx ``` -dlib Facial Landmark Detector is used, which is available under the Boost Software License -from https://github.com/davisking/dlib. The pretrained weights used are available -from http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2 and should be placed -in `./src/dlib_shape_predictor/`. 
- To start developing using Docker, simply use ```shell make run diff --git a/src/dlib_shape_predictor/.gitfolder b/opencv_model/.gitfolder similarity index 100% rename from src/dlib_shape_predictor/.gitfolder rename to opencv_model/.gitfolder diff --git a/requirements.txt b/requirements.txt index b40b0fc..0dba8f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,18 +2,17 @@ aiohttp==3.8.1 aiosignal==1.2.0 async-timeout==4.0.2 attrs==21.4.0 -black==21.12b0 +black==22.1.0 cairocffi==1.3.0 CairoSVG==2.5.2 cffi==1.15.0 cfgv==3.3.1 -charset-normalizer==2.0.10 +charset-normalizer==2.0.11 click==8.0.3 cssselect2==0.4.1 -deepgram-sdk==0.2.1 +deepgram-sdk==0.2.4 defusedxml==0.7.1 distlib==0.3.4 -dlib==19.22.1 drawSvg==1.8.3 ffmpeg-python==0.2.0 filelock==3.4.2 @@ -21,20 +20,19 @@ Flask==2.0.2 frozenlist==1.3.0 future==0.18.2 gunicorn==20.1.0 -identify==2.4.4 +identify==2.4.8 idna==3.3 -imageio==2.14.0 -imutils==0.5.4 +imageio==2.15.0 itsdangerous==2.0.1 Jinja2==3.0.3 MarkupSafe==2.0.1 -multidict==5.2.0 +multidict==6.0.2 mypy-extensions==0.4.3 nodeenv==1.6.0 -numpy==1.22.1 +numpy==1.22.2 opencv-python==4.5.5.62 pathspec==0.9.0 -Pillow==9.0.0 +Pillow==9.0.1 platformdirs==2.4.1 pre-commit==2.17.0 pycparser==2.21 @@ -43,9 +41,8 @@ PyYAML==6.0 six==1.16.0 tinycss2==1.1.1 toml==0.10.2 -tomli==1.2.3 -typing_extensions==4.0.1 -virtualenv==20.13.0 +tomli==2.0.0 +virtualenv==20.13.1 webencodings==0.5.1 websockets==10.1 Werkzeug==2.0.2 diff --git a/src/face_detector.py b/src/face_detector.py index 5314d18..cddf497 100644 --- a/src/face_detector.py +++ b/src/face_detector.py @@ -1,83 +1,89 @@ from pathlib import Path +import numpy as np import cv2 -import dlib -from imutils import face_utils from structures import Rect class FaceDetector: - def __init__(self): - self.DETECTOR = dlib.get_frontal_face_detector() - # self.PREDICTOR = dlib.shape_predictor( - # ( - # Path(".") - # / "src" - # / "dlib_shape_predictor" - # / "shape_predictor_68_face_landmarks.dat" - # ).as_posix() - 
# ) - - @staticmethod - def dist(a, b): - return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5 + def find_speaker_face(self, frame: np.ndarray) -> Rect: + raise NotImplementedError() + + +class FaceDetectorDNN(FaceDetector): + def __init__( + self, + model_path=Path("opencv_model", "opencv_face_detector_uint8.pb"), + config_path=Path("opencv_model", "opencv_face_detector.pbtxt"), + detection_threshold=0.5, + ): + self.model = cv2.dnn.readNetFromTensorflow(str(model_path), str(config_path)) + self.detection_threshold = detection_threshold def find_speaker_face(self, frame): - gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) - rects = self.DETECTOR(gray, 1) - - # this is the default speaker face position - speaker_face = Rect( - ((frame.shape[0] // 2) - 10), - ((frame.shape[1] // 2) - 10), - ((frame.shape[0] // 2) + 10), - ((frame.shape[1] // 2) + 10), + blob = cv2.dnn.blobFromImage( + frame, 1.0, (300, 300), [104, 117, 123], False, False ) - speaker_mouth_ratio = 0.0 + self.model.setInput(blob) + possible_face_detections = self.model.forward() # find the minimum bounding box that contains all speakers - min_x = frame.shape[0] - max_x = 0 - min_y = frame.shape[1] - max_y = 0 + min_x, min_y = frame.shape[1], frame.shape[0] + max_x, max_y = 0, 0 + + for i in range(possible_face_detections.shape[2]): + face = possible_face_detections[0, 0, i] + if face[2] > self.detection_threshold: + x1 = int(face[3] * frame.shape[1]) + y1 = int(face[4] * frame.shape[0]) + x2 = int(face[5] * frame.shape[1]) + y2 = int(face[6] * frame.shape[0]) + + min_x, min_y = min(min_x, x1), min(min_y, y1) + max_x, max_y = max(max_x, x2), max(max_y, y2) + + if min_x > max_x or min_y > max_y: + # Can't find a face, default to the whole image + min_x, min_y = 0, 0 + max_x, max_y = frame.shape[1], frame.shape[0] - for rect in rects: - # shape = self.PREDICTOR(gray, rect) - # shape = face_utils.shape_to_np(shape) + speakers_bb = Rect( + min_x, + min_y, + (max_x - min_x), + (max_y - min_y), + ) - # 
mouth_open = max( - # FaceDetector.dist(shape[61], shape[67]), - # FaceDetector.dist(shape[62], shape[66]), - # FaceDetector.dist(shape[63], shape[65]), - # ) - # mouth_width = FaceDetector.dist(shape[54], shape[48]) + return speakers_bb - (x, y, w, h) = face_utils.rect_to_bb(rect) +class FaceDetectorCascade(FaceDetector): + def __init__( + self, model_path=Path("opencv_model", "haarcascade_frontalface_default.xml") + ): + self.model = cv2.CascadeClassifier(str(model_path)) + + def find_speaker_face(self, frame): + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + gray = cv2.equalizeHist(gray) + + face_rects = self.model.detectMultiScale(gray, 1.1, 6) + + # find the minimum bounding box that contains all speakers + min_x, min_y = frame.shape[1], frame.shape[0] + max_x, max_y = 0, 0 + + for (x, y, w, h) in face_rects: # extend text exclusion bounding box to include speaker - if x < min_x: - min_x = x - if x + w > max_x: - max_x = x + w - if y < min_y: - min_y = y - if y + h > max_y: - max_y = y + h - - # if (mouth_open / mouth_width) > speaker_mouth_ratio: - # speaker_mouth_ratio = mouth_open / mouth_width - # speaker_face = Rect( - # x, - # y, - # w, - # h, - # ) - - if min_x > max_x: - max_x = min_x - max_y = min_y + min_x, min_y = min(x, min_x), min(y, min_y) + max_x, max_y = max(max_x, x + w), max(max_y, y + h) + + if min_x > max_x or min_y > max_y: + # Can't find a face, default to the whole image + min_x, min_y = 0, 0 + max_x, max_y = frame.shape[1], frame.shape[0] speakers_bb = Rect( min_x, @@ -86,21 +92,33 @@ def find_speaker_face(self, frame): (max_y - min_y), ) - return speaker_face, speakers_bb + return speakers_bb if __name__ == "__main__": cap = cv2.VideoCapture(0) - face_detector = FaceDetector() + face_detector = FaceDetectorDNN() if not cap.isOpened(): raise IOError("Cannot open webcam") while True: ret, frame = cap.read() - frame = cv2.resize(frame, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA) - cv2.imshow("Input", frame) - speaker_face, speakers_bb =
face_detector.find_speaker_face(frame) + + speakers_bb = face_detector.find_speaker_face(frame) + + cv2.rectangle( + frame, + (speakers_bb.x, speakers_bb.y), + ( + speakers_bb.x + speakers_bb.width, + speakers_bb.y + speakers_bb.height, + ), + (255, 0, 0), + 2, + ) + + cv2.imshow("Face", frame) if cv2.waitKey(1) == ord("q"): break diff --git a/src/main.py b/src/main.py index b58f342..04488bc 100644 --- a/src/main.py +++ b/src/main.py @@ -17,7 +17,7 @@ url_for, ) -from face_detector import FaceDetector +from face_detector import FaceDetector, FaceDetectorDNN from frame_processor import StyleTransfer from layout_generator import LayoutGenerator @@ -30,7 +30,7 @@ app = Flask(__name__) app.config["UPLOAD_FOLDER"] = (Path(".") / "uploads").resolve() app.config["MAX_CONTENT_LENGTH"] = 16 * 1000 * 1000 # Limit uploads to 16 MB. -app.config["PREFERRED_URL_SCHEME"] = "https" +app.config["PREFERRED_URL_SCHEME"] = "https" if PRODUCTION else "http" def pipe( @@ -62,10 +62,7 @@ def get_key_frame_index(segment: Segment) -> None: def detect_speaker(face_detector: FaceDetector): def face_detector_func(segment: Segment) -> None: segment.keyframe = segment.frames[segment.keyframe_index] - ( - segment.speaker_location, - segment.speakers_bbox, - ) = face_detector.find_speaker_face(segment.keyframe) + segment.speakers_bbox = face_detector.find_speaker_face(segment.keyframe) return face_detector_func @@ -145,7 +142,7 @@ def process_video(path: str) -> str: with open("transcript.json", "w") as file: json.dump(utterances, file, indent=4) - face_detector = FaceDetector() + face_detector = FaceDetectorDNN() pipeline = pipe( attach_frames(video), get_key_frame_index, diff --git a/src/structures.py b/src/structures.py index d7c917b..7a9f399 100644 --- a/src/structures.py +++ b/src/structures.py @@ -47,7 +47,6 @@ def __init__( frames: np.ndarray = None, keyframe_index: int = None, keyframe: np.ndarray = None, - speaker_location: Rect = None, speakers_bbox: Rect = None, image: ImageData = 
None, ): @@ -59,6 +58,5 @@ def __init__( self.frames = frames self.keyframe_index = keyframe_index self.keyframe = keyframe - self.speaker_location = speaker_location self.speakers_bbox = speakers_bbox self.image = image diff --git a/src/transcription.py b/src/transcription.py index d03171a..94e7154 100644 --- a/src/transcription.py +++ b/src/transcription.py @@ -1,11 +1,13 @@ -import os from contextlib import suppress +from os import getenv from textwrap import wrap from deepgram import Deepgram -from dotenv import load_dotenv -load_dotenv(".secrets") +if getenv("DEEPGRAM_API_KEY") is None: + from dotenv import load_dotenv + + load_dotenv(".secrets") def delete_keys(transcript: dict, keys: list): @@ -40,7 +42,7 @@ def validate_transcript(transcript: dict): async def transcribe(audio: bytes) -> list: - dg_client = Deepgram(os.getenv("DEEPGRAM_API_KEY")) + dg_client = Deepgram(getenv("DEEPGRAM_API_KEY")) source = {"buffer": audio, "mimetype": "audio/wav"}