Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
src/__pycache__
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,7 @@ cython_debug/
uploads/
*.dat
*.pickle
# OpenCV model files
haarcascade_frontalface_default.xml
*.pb
*.pbtxt
23 changes: 14 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
# Image for the yack web service: Python base image, a few system packages,
# the pinned Python requirements, the OpenCV face-detection models and src/.
FROM python:3
LABEL version="0.2"
LABEL description="Docker image for yack!"

# System packages (presumably runtime dependencies of opencv-python and
# ffmpeg-python -- confirm). The apt lists are removed in the same layer so
# they do not end up in the image.
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsm6 \
    libxext6 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
RUN mkdir /app/uploads

# Requirements are copied on their own so the pip layer can be reused when
# only the sources change.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Face-detection model files are fetched at build time.
# NOTE(review): these URLs follow the upstream `master` branches, so the build
# is not reproducible; consider pinning them to a commit.
ADD https://github.com/spmallick/learnopencv/raw/master/AgeGender/opencv_face_detector_uint8.pb /app/opencv_model/
ADD https://raw.githubusercontent.com/spmallick/learnopencv/master/AgeGender/opencv_face_detector.pbtxt /app/opencv_model/
ADD https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml /app/opencv_model/
COPY src/ /app/src/

EXPOSE 8000
# "--timeout 0" disables gunicorn's worker timeout.
CMD ["gunicorn", "--bind", ":8000", "--workers", "2", "--threads", "8", "--timeout", "0", "--pythonpath", "./src", "main:app"]
16 changes: 10 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,21 @@ install: download-model
pre-commit install

# Fetch the OpenCV face-detection model files into $(DIR)/opencv_model/.
# Each file is downloaded only when it is not already present.
download-model:
	if [ ! -f "$(DIR)/opencv_model/opencv_face_detector_uint8.pb" ]; then \
		wget https://github.com/spmallick/learnopencv/raw/master/AgeGender/opencv_face_detector_uint8.pb -P "$(DIR)/opencv_model/"; \
	fi;
	if [ ! -f "$(DIR)/opencv_model/opencv_face_detector.pbtxt" ]; then \
		wget https://raw.githubusercontent.com/spmallick/learnopencv/master/AgeGender/opencv_face_detector.pbtxt -P "$(DIR)/opencv_model/"; \
	fi;
	if [ ! -f "$(DIR)/opencv_model/haarcascade_frontalface_default.xml" ]; then \
		wget https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml -P "$(DIR)/opencv_model/"; \
	fi;

# Build the yack image from the repository root.
build:
	docker build -t yack:latest .

# Build the image, then run it in development mode on port 8000.
# "--rm" removes the container once it exits.
run: build
	docker run --rm -e "DEEPGRAM_API_KEY=$(DEEPGRAM_API_KEY)" -e "ENV=development" -p 8000:8000 yack

push: build
docker tag yack:latest $(CONTAINER_REGISTRY)/yack:latest
Expand Down
5 changes: 0 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,6 @@ Deepgram api is used for speech-to-text, get your key at https://deepgram.com an
DEEPGRAM_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
```

dlib Facial Landmark Detector is used, which is available under the Boost Software License
from https://github.com/davisking/dlib. The pretrained weights used are available
from http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2 and should be placed
in `./src/dlib_shape_predictor/`.

To start developing using Docker, simply use
```shell
make run
Expand Down
File renamed without changes.
23 changes: 10 additions & 13 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,37 @@ aiohttp==3.8.1
aiosignal==1.2.0
async-timeout==4.0.2
attrs==21.4.0
black==21.12b0
black==22.1.0
cairocffi==1.3.0
CairoSVG==2.5.2
cffi==1.15.0
cfgv==3.3.1
charset-normalizer==2.0.10
charset-normalizer==2.0.11
click==8.0.3
cssselect2==0.4.1
deepgram-sdk==0.2.1
deepgram-sdk==0.2.4
defusedxml==0.7.1
distlib==0.3.4
dlib==19.22.1
drawSvg==1.8.3
ffmpeg-python==0.2.0
filelock==3.4.2
Flask==2.0.2
frozenlist==1.3.0
future==0.18.2
gunicorn==20.1.0
identify==2.4.4
identify==2.4.8
idna==3.3
imageio==2.14.0
imutils==0.5.4
imageio==2.15.0
itsdangerous==2.0.1
Jinja2==3.0.3
MarkupSafe==2.0.1
multidict==5.2.0
multidict==6.0.2
mypy-extensions==0.4.3
nodeenv==1.6.0
numpy==1.22.1
numpy==1.22.2
opencv-python==4.5.5.62
pathspec==0.9.0
Pillow==9.0.0
Pillow==9.0.1
platformdirs==2.4.1
pre-commit==2.17.0
pycparser==2.21
Expand All @@ -43,9 +41,8 @@ PyYAML==6.0
six==1.16.0
tinycss2==1.1.1
toml==0.10.2
tomli==1.2.3
typing_extensions==4.0.1
virtualenv==20.13.0
tomli==2.0.0
virtualenv==20.13.1
webencodings==0.5.1
websockets==10.1
Werkzeug==2.0.2
Expand Down
150 changes: 84 additions & 66 deletions src/face_detector.py
Original file line number Diff line number Diff line change
@@ -1,83 +1,89 @@
from pathlib import Path

import numpy as np
import cv2
import dlib
from imutils import face_utils

from structures import Rect


class FaceDetector:
    """Common interface of the face detectors in this module.

    Concrete detectors override :meth:`find_speaker_face`.
    """

    def find_speaker_face(self, frame: np.ndarray) -> Rect:
        """Return the bounding box of the speakers found in ``frame``.

        Args:
            frame: Image to search for faces.

        Returns:
            A ``Rect`` describing where the speakers are.

        Raises:
            NotImplementedError: Always; subclasses provide the implementation.
        """
        raise NotImplementedError()


class FaceDetectorDNN(FaceDetector):
    """Face detector that runs a TensorFlow model through OpenCV's DNN module.

    Args:
        model_path: Path to the model weights (``.pb``).
        config_path: Path to the matching graph description (``.pbtxt``).
        detection_threshold: A detection is kept only when its confidence is
            strictly greater than this value.
    """

    def __init__(
        self,
        model_path=Path("opencv_model", "opencv_face_detector_uint8.pb"),
        config_path=Path("opencv_model", "opencv_face_detector.pbtxt"),
        detection_threshold=0.5,
    ):
        self.model = cv2.dnn.readNetFromTensorflow(str(model_path), str(config_path))
        self.detection_threshold = detection_threshold

    @staticmethod
    def _to_pixel(fraction, size):
        """Scale a relative coordinate to pixels, clamped to ``[0, size]``."""
        return min(max(int(fraction * size), 0), size)

    def find_speaker_face(self, frame):
        """Return the smallest box that encloses every detected face.

        Args:
            frame: Image array whose first axis is rows (height) and whose
                second axis is columns (width).

        Returns:
            A ``Rect(x, y, width, height)`` in pixel coordinates, clamped to
            the frame. When no detection passes the threshold the box covers
            the whole frame.
        """
        frame_height, frame_width = frame.shape[:2]

        # Positional arguments: scale factor 1.0, network input size 300x300,
        # mean [104, 117, 123] subtracted, no R/B swap, no crop.
        blob = cv2.dnn.blobFromImage(
            frame, 1.0, (300, 300), [104, 117, 123], False, False
        )
        self.model.setInput(blob)
        possible_face_detections = self.model.forward()

        # Start from an "empty" box (min > max) so the first hit replaces it.
        min_x, min_y = frame_width, frame_height
        max_x, max_y = 0, 0

        # Each row carries the confidence at index 2 and the box corners,
        # relative to the frame size, at indices 3-6.
        for face in possible_face_detections[0, 0]:
            if face[2] > self.detection_threshold:
                # The network may report corners slightly outside the image;
                # clamp them so the resulting box never leaves the frame.
                x1 = self._to_pixel(face[3], frame_width)
                y1 = self._to_pixel(face[4], frame_height)
                x2 = self._to_pixel(face[5], frame_width)
                y2 = self._to_pixel(face[6], frame_height)

                min_x, min_y = min(min_x, x1), min(min_y, y1)
                max_x, max_y = max(max_x, x2), max(max_y, y2)

        if min_x > max_x or min_y > max_y:
            # Can't find a face, default to the whole image
            min_x, min_y = 0, 0
            max_x, max_y = frame_width, frame_height

        speakers_bb = Rect(
            min_x,
            min_y,
            (max_x - min_x),
            (max_y - min_y),
        )

        return speakers_bb

class FaceDetectorCascade(FaceDetector):
    """Face detector based on an OpenCV Haar cascade classifier.

    Args:
        model_path: Path to the cascade description (``.xml``).
    """

    def __init__(
        self, model_path=Path("opencv_model", "haarcascade_frontalface_default.xml")
    ):
        self.model = cv2.CascadeClassifier(str(model_path))

    def find_speaker_face(self, frame):
        """Return the smallest box that encloses every detected face.

        Args:
            frame: Image array whose first axis is rows (height) and whose
                second axis is columns (width); it is converted with
                ``cv2.COLOR_BGR2GRAY`` before detection.

        Returns:
            A ``Rect(x, y, width, height)`` in pixel coordinates. When no face
            is found the box covers the whole frame.
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        gray = cv2.equalizeHist(gray)

        # Positional arguments: scaleFactor=1.1, minNeighbors=6.
        face_rects = self.model.detectMultiScale(gray, 1.1, 6)

        # find the minimum bounding box that contains all speakers
        min_x, min_y = frame.shape[1], frame.shape[0]
        max_x, max_y = 0, 0

        for (x, y, w, h) in face_rects:
            # Plain ints keep the result consistent with FaceDetectorDNN.
            x, y, w, h = int(x), int(y), int(w), int(h)

            # extend text exclusion bounding box to include speaker; the
            # right/bottom edges must be compared with the running maximum,
            # otherwise only the last face would define them.
            min_x, min_y = min(x, min_x), min(y, min_y)
            max_x, max_y = max(max_x, x + w), max(max_y, y + h)

        if min_x > max_x or min_y > max_y:
            # Can't find a face, default to the whole image
            min_x, min_y = 0, 0
            max_x, max_y = frame.shape[1], frame.shape[0]

        speakers_bb = Rect(
            min_x,
            min_y,
            (max_x - min_x),
            (max_y - min_y),
        )

        return speakers_bb


if __name__ == "__main__":
cap = cv2.VideoCapture(0)
face_detector = FaceDetector()
face_detector = FaceDetectorDNN()

if not cap.isOpened():
raise IOError("Cannot open webcam")

while True:
ret, frame = cap.read()
frame = cv2.resize(frame, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
cv2.imshow("Input", frame)
speaker_face, speakers_bb = face_detector.find_speaker_face(frame)

speakers_bb = face_detector.find_speaker_face(frame)

cv2.rectangle(
frame,
(speakers_bb.x, speakers_bb.y),
(
speakers_bb.x + speakers_bb.width,
speakers_bb.y + speakers_bb.height,
),
(255, 0, 0),
2,
)

cv2.imshow("Face", frame)

if cv2.waitKey(1) == ord("q"):
break
Expand Down
11 changes: 4 additions & 7 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
url_for,
)

from face_detector import FaceDetector
from face_detector import FaceDetector, FaceDetectorDNN
from frame_processor import StyleTransfer

from layout_generator import LayoutGenerator
Expand All @@ -30,7 +30,7 @@
app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = (Path(".") / "uploads").resolve()
app.config["MAX_CONTENT_LENGTH"] = 16 * 1000 * 1000 # Limit uploads to 16 MB.
app.config["PREFERRED_URL_SCHEME"] = "https"
app.config["PREFERRED_URL_SCHEME"] = "https" if PRODUCTION else "http"


def pipe(
Expand Down Expand Up @@ -62,10 +62,7 @@ def get_key_frame_index(segment: Segment) -> None:
def detect_speaker(face_detector: FaceDetector):
    """Build a pipeline step that locates the speakers in a segment's keyframe.

    Args:
        face_detector: Detector whose ``find_speaker_face`` is called on the
            keyframe.

    Returns:
        A function that takes a ``Segment``, stores
        ``segment.frames[segment.keyframe_index]`` in ``segment.keyframe`` and
        the detector's result in ``segment.speakers_bbox``.
    """

    def face_detector_func(segment: Segment) -> None:
        segment.keyframe = segment.frames[segment.keyframe_index]
        # The detector returns a single bounding box covering all speakers.
        segment.speakers_bbox = face_detector.find_speaker_face(segment.keyframe)

    return face_detector_func

Expand Down Expand Up @@ -145,7 +142,7 @@ def process_video(path: str) -> str:
with open("transcript.json", "w") as file:
json.dump(utterances, file, indent=4)

face_detector = FaceDetector()
face_detector = FaceDetectorDNN()
pipeline = pipe(
attach_frames(video),
get_key_frame_index,
Expand Down
2 changes: 0 additions & 2 deletions src/structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def __init__(
frames: np.ndarray = None,
keyframe_index: int = None,
keyframe: np.ndarray = None,
speaker_location: Rect = None,
speakers_bbox: Rect = None,
image: ImageData = None,
):
Expand All @@ -59,6 +58,5 @@ def __init__(
self.frames = frames
self.keyframe_index = keyframe_index
self.keyframe = keyframe
self.speaker_location = speaker_location
self.speakers_bbox = speakers_bbox
self.image = image
Loading