diff --git a/.run/devserver.run.xml b/.run/devserver.run.xml
index 1c94ee6402..55b6546404 100644
--- a/.run/devserver.run.xml
+++ b/.run/devserver.run.xml
@@ -13,7 +13,7 @@
-
+
diff --git a/Makefile b/Makefile
index 002d337323..6eebcf6d54 100644
--- a/Makefile
+++ b/Makefile
@@ -200,7 +200,7 @@ dctest: .docker/minio .docker/postgres
dcservicesup: .docker/minio .docker/postgres
# launch all studio's dependent services using docker-compose
- $(DOCKER_COMPOSE) -f docker-compose.yml -f docker-compose.alt.yml up minio postgres redis
+ $(DOCKER_COMPOSE) -f docker-compose.yml -f docker-compose.alt.yml up minio postgres redis studio-nginx
dcservicesdown:
# stop services that were started using dcservicesup
diff --git a/contentcuration/contentcuration/management/commands/restore_channel.py b/contentcuration/contentcuration/management/commands/restore_channel.py
index 6133ec3806..16b3976228 100644
--- a/contentcuration/contentcuration/management/commands/restore_channel.py
+++ b/contentcuration/contentcuration/management/commands/restore_channel.py
@@ -2,27 +2,65 @@
from django.core.management.base import BaseCommand
-from contentcuration.utils.import_tools import import_channel
+from contentcuration.utils.import_tools import ImportManager
logger = logging.getLogger("command")
class Command(BaseCommand):
+ """
+ This command is used to restore a channel from another Studio instance. This is for
+ development purposes only and should not be used in production.
+ """
+
def add_arguments(self, parser):
# ID of channel to read data from
parser.add_argument("source_id", type=str)
# ID of channel to write data to (can be same as source channel)
- parser.add_argument("--target", help="restore channel db to TARGET CHANNEL ID")
- parser.add_argument("--download-url", help="where to download db from")
- parser.add_argument("--editor", help="add user as editor to channel")
+ parser.add_argument(
+ "--target",
+        help="A different channel ID to restore the channel to. If not provided, the source channel ID is used.",
+ )
+ parser.add_argument(
+ "--source-url",
+ default="http://localhost:8080",
+ help="Studio instance from which to download the channel DB or content files",
+ )
+ parser.add_argument("--token", help="API token for the Studio instance")
+ parser.add_argument(
+ "--editor",
+ default="a@a.com",
+        help="Email address of a user to add as an editor to the channel",
+ )
+ parser.add_argument(
+ "--download-content",
+ action="store_true",
+ default=False,
+ help="Whether to download content files",
+ )
+ parser.add_argument(
+ "--public",
+ action="store_true",
+ default=False,
+ help="Whether to make the channel public",
+ )
+ parser.add_argument(
+ "--publish",
+ action="store_true",
+ default=False,
+ help="Whether to publish the channel after restoration",
+ )
def handle(self, *args, **options):
- # Set up variables for restoration process
- logger.info("\n\n********** STARTING CHANNEL RESTORATION **********")
- source_id = options["source_id"]
- target_id = options.get("target") or source_id
- download_url = options.get("download_url")
- editor = options.get("editor")
-
- import_channel(source_id, target_id, download_url, editor, logger=logger)
+ manager = ImportManager(
+ options["source_url"],
+ options["source_id"],
+ target_id=options.get("target"),
+ editor=options.get("editor"),
+ public=options.get("public"),
+ publish=options.get("publish"),
+ token=options.get("token"),
+ download_content=options.get("download_content"),
+ )
+ manager.run()
diff --git a/contentcuration/contentcuration/management/commands/set_content_mimetypes.py b/contentcuration/contentcuration/management/commands/set_content_mimetypes.py
index 27af4732fc..8a79fd02f5 100755
--- a/contentcuration/contentcuration/management/commands/set_content_mimetypes.py
+++ b/contentcuration/contentcuration/management/commands/set_content_mimetypes.py
@@ -14,7 +14,7 @@
from django.core.files.storage import default_storage
from django.core.management.base import BaseCommand
-from contentcuration.utils.storage_common import determine_content_type
+from contentcuration.utils.storage.common import determine_content_type
class Command(BaseCommand):
diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py
index 32727f0159..8aefab24ee 100644
--- a/contentcuration/contentcuration/models.py
+++ b/contentcuration/contentcuration/models.py
@@ -2,7 +2,6 @@
import json
import logging
import os
-import urllib.parse
import uuid
from datetime import datetime
@@ -671,44 +670,10 @@ def generate_storage_url(filename, request=None, *args):
path = generate_object_storage_name(os.path.splitext(filename)[0], filename)
- # There are three scenarios where Studio might be run as:
- #
- # 1. In normal kubernetes, nginx will proxy for us. We'll know we're in kubernetes when the
- # environment variable RUN_MODE=k8s
- #
- # 2. In Docker Compose and bare metal runserver, we'll be running in runserver, and minio
- # will be exposed in port 9000 in the host's localhost network.
-
- # Note (aron): returning the true storage URL (e.g. https://storage.googleapis.com/storage/a.mp4)
- # isn't too important, because we have CDN in front of our servers, so it should be cached.
- # But change the logic here in case there is a potential for bandwidth and latency improvement.
-
- # Detect our current state first
- run_mode = os.getenv("RUN_MODE")
-
- # if we're running inside k8s, then just serve the normal /content/{storage,databases} URL,
- # and let nginx handle proper proxying.
- if run_mode == "k8s":
- url = "/content/{path}".format(
- path=path,
- )
-
- # if we're in docker-compose or in baremetal, just return the object storage URL as localhost:9000
- elif run_mode == "docker-compose" or run_mode is None:
- # generate the minio storage URL, so we can get the GET parameters that give everyone
- # access even if they don't need to log in
- params = urllib.parse.urlparse(default_storage.url(path)).query
- host = "localhost"
- port = 9000 # hardcoded to the default minio IP address
- url = "http://{host}:{port}/{bucket}/{path}?{params}".format(
- host=host,
- port=port,
- bucket=settings.AWS_S3_BUCKET_NAME,
- path=path,
- params=params,
- )
-
- return url
+    # Requires that /content is always proxied to the storage bucket; nginx handles this in dev.
+ return "/content/{path}".format(
+ path=path,
+ )
class FileOnDiskStorage(FileSystemStorage):
diff --git a/contentcuration/contentcuration/production_settings.py b/contentcuration/contentcuration/production_settings.py
index a00bf43a41..82319bd85e 100644
--- a/contentcuration/contentcuration/production_settings.py
+++ b/contentcuration/contentcuration/production_settings.py
@@ -10,7 +10,7 @@
MEDIA_ROOT = base_settings.STORAGE_ROOT
-DEFAULT_FILE_STORAGE = "contentcuration.utils.gcs_storage.CompositeGCS"
+DEFAULT_FILE_STORAGE = "contentcuration.utils.storage.gcs.CompositeGCS"
SESSION_ENGINE = "django.contrib.sessions.backends.db"
# email settings
diff --git a/contentcuration/contentcuration/sandbox_settings.py b/contentcuration/contentcuration/sandbox_settings.py
index 61e00a465f..912fed7244 100644
--- a/contentcuration/contentcuration/sandbox_settings.py
+++ b/contentcuration/contentcuration/sandbox_settings.py
@@ -3,7 +3,7 @@
DEBUG = True
-DEFAULT_FILE_STORAGE = "contentcuration.utils.gcs_storage.CompositeGCS"
+DEFAULT_FILE_STORAGE = "contentcuration.utils.storage.gcs.CompositeGCS"
LANGUAGES += (("ar", gettext("Arabic")),) # noqa
diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py
index 0f18ed0131..e57064601d 100644
--- a/contentcuration/contentcuration/settings.py
+++ b/contentcuration/contentcuration/settings.py
@@ -357,7 +357,7 @@ def gettext(s):
ORPHAN_DATE_CLEAN_UP_THRESHOLD = TWO_WEEKS_AGO
# CLOUD STORAGE SETTINGS
-DEFAULT_FILE_STORAGE = "django_s3_storage.storage.S3Storage"
+DEFAULT_FILE_STORAGE = "contentcuration.utils.storage.dev.CompositeStorage"
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") or "development"
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") or "development"
AWS_S3_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME") or "content"
diff --git a/contentcuration/contentcuration/tests/test_restore_channel.py b/contentcuration/contentcuration/tests/test_restore_channel.py
deleted file mode 100644
index 6c5e1500ff..0000000000
--- a/contentcuration/contentcuration/tests/test_restore_channel.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# -*- coding: utf-8 -*-
-import datetime
-import json
-import uuid
-from io import BytesIO
-
-from django.core.files.storage import default_storage
-from django.template.loader import render_to_string
-from django.utils.translation import activate
-from django.utils.translation import deactivate
-from le_utils.constants import exercises
-from mixer.backend.django import mixer
-from mock import MagicMock
-from mock import patch
-
-from .base import StudioTestCase
-from contentcuration.models import AssessmentItem
-from contentcuration.models import generate_object_storage_name
-from contentcuration.utils.import_tools import create_channel
-from contentcuration.utils.import_tools import generate_assessment_item
-from contentcuration.utils.import_tools import process_content
-
-
-thumbnail_path = "/content/thumbnail.png"
-ASSESSMENT_DATA = {
- "input-question-test": {
- "template": "perseus/input_question.json",
- "type": exercises.INPUT_QUESTION,
- "question": "Input question",
- "question_images": [{"name": "test.jpg", "width": 12.71, "height": 12.12}],
- "hints": [{"hint": "Hint 1"}],
- "answers": [
- {"answer": "1", "correct": True, "images": []},
- {"answer": "2", "correct": True, "images": []},
- ],
- "order": 0,
- },
- "multiple-selection-test": {
- "template": "perseus/multiple_selection.json",
- "type": exercises.MULTIPLE_SELECTION,
- "question": "Multiple selection question",
- "question_images": [],
- "hints": [],
- "answers": [
- {"answer": "A", "correct": True, "images": []},
- {"answer": "B", "correct": True, "images": []},
- {"answer": "C", "correct": False, "images": []},
- ],
- "multiple_select": True,
- "order": 1,
- "randomize": False,
- },
- "single-selection-test": {
- "template": "perseus/multiple_selection.json",
- "type": exercises.SINGLE_SELECTION,
- "question": "Single select question",
- "question_images": [],
- "hints": [{"hint": "Hint test"}],
- "answers": [
- {"answer": "Correct answer", "correct": True, "images": []},
- {"answer": "Incorrect answer", "correct": False, "images": []},
- ],
- "multiple_select": False,
- "order": 2,
- "randomize": True,
- },
- "perseus-question-test": {
- "template": "perseus/perseus_question.json",
- "type": exercises.PERSEUS_QUESTION,
- "order": 3,
- "raw_data": "{}",
- },
-}
-
-
-class ChannelRestoreUtilityFunctionTestCase(StudioTestCase):
- @patch(
- "contentcuration.utils.import_tools.write_to_thumbnail_file",
- return_value=thumbnail_path,
- )
- def setUp(self, thumb_mock):
- self.id = uuid.uuid4().hex
- self.name = "test name"
- self.description = "test description"
- self.thumbnail_encoding = "base64 string"
- self.root_pk = uuid.uuid4()
- self.version = 7
- self.last_updated = datetime.datetime.now()
- self.cursor_mock = MagicMock()
- self.cursor_mock.execute.return_value.fetchone.return_value = (
- self.id,
- self.name,
- self.description,
- self.thumbnail_encoding,
- self.root_pk,
- self.version,
- self.last_updated,
- )
- self.channel, _ = create_channel(self.cursor_mock, self.id, self.admin_user)
-
- def test_restore_channel_id(self):
- self.assertEqual(self.channel.id, self.id)
-
- def test_restore_channel_name(self):
- self.assertEqual(self.channel.name, self.name)
-
- def test_restore_channel_description(self):
- self.assertEqual(self.channel.description, self.description)
-
- def test_restore_channel_thumbnail(self):
- self.assertEqual(self.channel.thumbnail, thumbnail_path)
-
- def test_restore_channel_thumbnail_encoding(self):
- self.assertEqual(
- self.channel.thumbnail_encoding["base64"], self.thumbnail_encoding
- )
-
- def test_restore_channel_version(self):
- self.assertEqual(self.channel.version, self.version)
-
-
-class PerseusRestoreTestCase(StudioTestCase):
- def setUp(self):
- super(PerseusRestoreTestCase, self).setUp()
- image_path = generate_object_storage_name("test", "test.png")
- default_storage.save(image_path, BytesIO(b"test"))
-
- def test_process_content(self):
- tests = [
- {"content": "test 1", "output": "test 1", "images": {}},
- {
- "content": "test 2 ",
- "output": "test 2 ",
- "images": {},
- },
- {
- "content": "test 3 ",
- "output": "test 3 ",
- "images": {
- "${☣ LOCALPATH}/images/test.png": {"width": 50, "height": 50}
- },
- },
- {
- "content": "test 4  ",
- "output": "test 4  ",
- "images": {},
- },
- {
- "content": "test 5 $\\sqrt{36}+\\frac{1}{2}$ ",
- "output": "test 5 $$\\sqrt{36}+\\frac{1}{2}$$",
- "images": {},
- },
- {
- "content": "test 6 $\\frac{1}{2}$ $\\frac{3}{2}$",
- "output": "test 6 $$\\frac{1}{2}$$ $$\\frac{3}{2}$$",
- "images": {},
- },
- ]
- for test in tests:
- result = process_content(test, mixer.blend(AssessmentItem))
- self.assertEqual(result, test["output"])
-
- def test_generate_assessment_item(self):
- # Run in Spanish to ensure we are properly creating JSON with non-localized numbers
- activate("es-es")
- for assessment_id, data in list(ASSESSMENT_DATA.items()):
- assessment_data = json.loads(
- render_to_string(data["template"], data).encode("utf-8", "ignore")
- )
- assessment_item = generate_assessment_item(
- assessment_id, data["order"], data["type"], assessment_data
- )
- self.assertEqual(assessment_item.type, data["type"])
- self.assertEqual(assessment_item.question, data.get("question", ""))
- self.assertEqual(assessment_item.randomize, bool(data.get("randomize")))
- self.assertEqual(assessment_item.raw_data, data.get("raw_data", ""))
- for hint in json.loads(assessment_item.hints):
- self.assertTrue(
- any(h for h in data["hints"] if h["hint"] == hint["hint"])
- )
- for answer in json.loads(assessment_item.answers):
- self.assertTrue(
- any(
- a
- for a in data["answers"]
- if a["answer"] == str(answer["answer"])
- and a["correct"] == answer["correct"]
- )
- )
- deactivate()
diff --git a/contentcuration/contentcuration/tests/utils/test_cloud_storage.py b/contentcuration/contentcuration/tests/utils/test_cloud_storage.py
deleted file mode 100644
index 5d84fd9f10..0000000000
--- a/contentcuration/contentcuration/tests/utils/test_cloud_storage.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from django.test import TestCase
-
-from contentcuration.utils.cloud_storage import CloudStorage
-
-
-class CloudStorageTestCase(TestCase):
- def test_backend_initialization(self):
- cloud_storage_instance = CloudStorage()
- self.assertIsNotNone(cloud_storage_instance)
- self.assertIsInstance(cloud_storage_instance, CloudStorage)
diff --git a/contentcuration/contentcuration/tests/test_gcs_storage.py b/contentcuration/contentcuration/tests/utils/test_gcs_storage.py
similarity index 95%
rename from contentcuration/contentcuration/tests/test_gcs_storage.py
rename to contentcuration/contentcuration/tests/utils/test_gcs_storage.py
index a58420873e..9036641774 100755
--- a/contentcuration/contentcuration/tests/test_gcs_storage.py
+++ b/contentcuration/contentcuration/tests/utils/test_gcs_storage.py
@@ -8,8 +8,8 @@
from google.cloud.storage.blob import Blob
from mixer.main import mixer
-from contentcuration.utils.gcs_storage import CompositeGCS
-from contentcuration.utils.gcs_storage import GoogleCloudStorage
+from contentcuration.utils.storage.gcs import CompositeGCS
+from contentcuration.utils.storage.gcs import GoogleCloudStorage
class GoogleCloudStorageSaveTestCase(TestCase):
@@ -74,9 +74,9 @@ def test_uploads_cache_control_private_if_content_database(self):
self.storage.save(filename, self.content, blob_object=self.blob_obj)
assert "private" in self.blob_obj.cache_control
- @mock.patch("contentcuration.utils.gcs_storage.BytesIO")
+ @mock.patch("contentcuration.utils.storage.gcs.BytesIO")
@mock.patch(
- "contentcuration.utils.gcs_storage.GoogleCloudStorage._is_file_empty",
+ "contentcuration.utils.storage.gcs.GoogleCloudStorage._is_file_empty",
return_value=False,
)
def test_gzip_if_content_database(self, bytesio_mock, file_empty_mock):
@@ -158,10 +158,10 @@ def setUp(self):
self.mock_anon_client.get_bucket.return_value = self.mock_anon_bucket
with mock.patch(
- "contentcuration.utils.gcs_storage._create_default_client",
+ "contentcuration.utils.storage.gcs._create_default_client",
return_value=self.mock_default_client,
), mock.patch(
- "contentcuration.utils.gcs_storage.Client.create_anonymous_client",
+ "contentcuration.utils.storage.gcs.Client.create_anonymous_client",
return_value=self.mock_anon_client,
):
self.storage = CompositeGCS()
@@ -192,7 +192,7 @@ def test_open(self):
self.assertIsInstance(f, File)
self.mock_default_bucket.get_blob.assert_called_with("blob")
- @mock.patch("contentcuration.utils.gcs_storage.Blob")
+ @mock.patch("contentcuration.utils.storage.gcs.Blob")
def test_save(self, mock_blob):
self.storage.save("blob", BytesIO(b"content"))
blob = mock_blob.return_value
diff --git a/contentcuration/contentcuration/tests/test_storage_common.py b/contentcuration/contentcuration/tests/utils/test_storage.py
similarity index 84%
rename from contentcuration/contentcuration/tests/test_storage_common.py
rename to contentcuration/contentcuration/tests/utils/test_storage.py
index f89534c194..b4c0e0db20 100644
--- a/contentcuration/contentcuration/tests/test_storage_common.py
+++ b/contentcuration/contentcuration/tests/utils/test_storage.py
@@ -7,17 +7,15 @@
import requests
from django.core.files.storage import FileSystemStorage
from django.test import TestCase
-from django_s3_storage.storage import S3Storage
from mock import MagicMock
-from .base import StudioTestCase
+from ..base import StudioTestCase
from contentcuration.models import generate_object_storage_name
-from contentcuration.utils.storage_common import _get_gcs_presigned_put_url
-from contentcuration.utils.storage_common import determine_content_type
-from contentcuration.utils.storage_common import get_presigned_upload_url
-from contentcuration.utils.storage_common import UnknownStorageBackendError
-
-# The modules we'll test
+from contentcuration.utils.storage.common import determine_content_type
+from contentcuration.utils.storage.common import get_presigned_upload_url
+from contentcuration.utils.storage.common import UnknownStorageBackendError
+from contentcuration.utils.storage.dev import Storage as DevStorage
+from contentcuration.utils.storage.gcs import GoogleCloudStorage
class MimeTypesTestCase(TestCase):
@@ -81,7 +79,6 @@ def test_raises_error(self):
"nice",
"err",
5,
- 0,
storage=self.STORAGE,
)
@@ -95,7 +92,9 @@ class GoogleCloudStoragePresignedURLUnitTestCase(TestCase):
"""
def setUp(self):
+ super().setUp()
self.client = MagicMock()
+ self.storage = GoogleCloudStorage(self.client, "fake")
self.generate_signed_url_method = (
self.client.get_bucket.return_value.blob.return_value.generate_signed_url
)
@@ -107,19 +106,15 @@ def test_that_generate_signed_url_is_called(self):
"""
Check that we even call blob.generate_signed_url in the first place.
"""
- bucket = "fake"
- _get_gcs_presigned_put_url(self.client, bucket, "/object.jpg", "aBc", 0, 0)
+ get_presigned_upload_url("/object.jpg", "aBc", 0, storage=self.storage)
self.generate_signed_url_method.assert_called_once()
def test_that_we_return_a_string(self):
"""
Check that _get_gcs_presigned_put_url returns a string.
"""
- bucket = "fake"
- ret = _get_gcs_presigned_put_url(
- self.client, bucket, "/object.jpg", "aBc", 0, 0
- )
- assert isinstance(ret, str)
+ ret = get_presigned_upload_url("/object.jpg", "aBc", 0, storage=self.storage)
+ assert isinstance(ret["uploadURL"], str)
def test_generate_signed_url_called_with_required_arguments(self):
"""
@@ -137,11 +132,9 @@ def test_generate_signed_url_called_with_required_arguments(self):
bucket_name = "fake"
filepath = "object.jpg"
lifetime = 20 # seconds
- mimetype = "doesntmatter"
+ mimetype = "image/jpeg"
- _get_gcs_presigned_put_url(
- self.client, bucket_name, filepath, content_md5, lifetime, mimetype
- )
+ get_presigned_upload_url(filepath, content_md5, lifetime, storage=self.storage)
# assert that we're creating the right object
self.client.get_bucket.assert_called_once_with(bucket_name)
@@ -153,8 +146,8 @@ def test_generate_signed_url_called_with_required_arguments(self):
self.generate_signed_url_method.assert_called_once_with(
method=method,
content_md5=content_md5,
- expiration=lifetime_timedelta,
content_type=mimetype,
+ expiration=lifetime_timedelta,
)
@@ -163,11 +156,9 @@ class S3StoragePresignedURLUnitTestCase(StudioTestCase):
Test cases for generating presigned URLs for S3 storage, i.e. Minio.
"""
- STORAGE = S3Storage()
-
def setUp(self):
- self.client = MagicMock()
super().setUp()
+ self.storage = DevStorage()
def test_returns_string_if_inputs_are_valid(self):
"""
@@ -176,9 +167,7 @@ def test_returns_string_if_inputs_are_valid(self):
"""
# use a real connection here as a sanity check
- ret = get_presigned_upload_url(
- "a/b/abc.jpg", "aBc", 10, 1, storage=self.STORAGE, client=None
- )
+ ret = get_presigned_upload_url("a/b/abc.jpg", "aBc", 10, storage=self.storage)
url = ret["uploadURL"]
assert isinstance(url, str)
@@ -199,9 +188,7 @@ def test_can_upload_file_to_presigned_url(self):
filename = "blahfile.jpg"
filepath = generate_object_storage_name(md5_checksum, filename)
- ret = get_presigned_upload_url(
- filepath, md5_checksum_base64, 1000, len(file_contents)
- )
+ ret = get_presigned_upload_url(filepath, md5_checksum_base64, 1000)
url = ret["uploadURL"]
content_type = ret["mimetype"]
diff --git a/contentcuration/contentcuration/utils/cloud_storage.py b/contentcuration/contentcuration/utils/cloud_storage.py
deleted file mode 100644
index a331226905..0000000000
--- a/contentcuration/contentcuration/utils/cloud_storage.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from automation.utils.appnexus.base import Backend
-from automation.utils.appnexus.base import BackendFactory
-from automation.utils.appnexus.base import BackendRequest
-from automation.utils.appnexus.base import BackendResponse
-
-
-class CloudStorageBackendRequest(BackendRequest):
- pass
-
-
-class CloudStorageRequest(CloudStorageBackendRequest):
- def __init__(self) -> None:
- super().__init__()
-
-
-class CloudStorageBackendResponse(BackendResponse):
- pass
-
-
-class CloudStorageResponse(CloudStorageBackendResponse):
- def __init__(self) -> None:
- pass
-
-
-class CloudStorageBackendFactory(BackendFactory):
- def create_backend(self) -> Backend:
- return super().create_backend()
-
-
-class CloudStorage(Backend):
- def connect(self) -> None:
- return super().connect()
-
- def make_request(self, request) -> CloudStorageResponse:
- return super().make_request(request)
-
- @classmethod
- def _create_instance(cls) -> "CloudStorage":
- return cls()
diff --git a/contentcuration/contentcuration/utils/files.py b/contentcuration/contentcuration/utils/files.py
index 0cb447a601..18f21dd702 100644
--- a/contentcuration/contentcuration/utils/files.py
+++ b/contentcuration/contentcuration/utils/files.py
@@ -85,12 +85,13 @@ def duplicate_file(
return file_copy
-def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH):
+def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH, input_buffer=None):
"""
Generates a base64 encoding for a thumbnail
Args:
filename (str): thumbnail to generate encoding from (must be in storage already)
dimension (int, optional): desired width of thumbnail. Defaults to 400.
+ input_buffer (BytesIO, optional): buffer to read from. Defaults to None.
Returns base64 encoding of resized thumbnail
"""
@@ -103,23 +104,23 @@ def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH):
# make sure the aspect ratio between width and height is 16:9
thumbnail_size = [dimension, round(dimension / 1.77)]
try:
- if not filename.startswith(settings.STATIC_ROOT):
- filename = generate_object_storage_name(checksum, filename)
- inbuffer = default_storage.open(filename, "rb")
-
- else:
- # Normalize the path and ensure it is indeed within STATIC_ROOT
- normalized_path = os.path.normpath(filename)
- static_root = os.path.abspath(settings.STATIC_ROOT)
- abs_path = os.path.abspath(normalized_path)
- if not abs_path.startswith(static_root + os.sep):
- raise ValueError("Attempted access to file outside of STATIC_ROOT")
- inbuffer = open(abs_path, "rb")
-
- if not inbuffer:
+ if not input_buffer:
+ if not filename.startswith(settings.STATIC_ROOT):
+ filename = generate_object_storage_name(checksum, filename)
+ input_buffer = default_storage.open(filename, "rb")
+ else:
+ # Normalize the path and ensure it is indeed within STATIC_ROOT
+ normalized_path = os.path.normpath(filename)
+ static_root = os.path.abspath(settings.STATIC_ROOT)
+ abs_path = os.path.abspath(normalized_path)
+ if not abs_path.startswith(static_root + os.sep):
+ raise ValueError("Attempted access to file outside of STATIC_ROOT")
+                input_buffer = open(abs_path, "rb")
+
+ if not input_buffer:
raise AssertionError
- with Image.open(inbuffer) as image:
+ with Image.open(input_buffer) as image:
image_format = image.format
# Note: Image.thumbnail ensures that the image will fit in the
@@ -136,7 +137,7 @@ def get_thumbnail_encoding(filename, dimension=THUMBNAIL_WIDTH):
finally:
# Try to close the inbuffer if it has been created
try:
- inbuffer.close()
+ input_buffer.close()
except UnboundLocalError:
pass
outbuffer.close()
diff --git a/contentcuration/contentcuration/utils/import_tools.py b/contentcuration/contentcuration/utils/import_tools.py
index 0a187ce4c9..bc6e58baa1 100644
--- a/contentcuration/contentcuration/utils/import_tools.py
+++ b/contentcuration/contentcuration/utils/import_tools.py
@@ -1,30 +1,39 @@
# -*- coding: utf-8 -*-
import datetime
+import hashlib
import json
import logging
import os
import re
-import shutil
import sqlite3
import sys
import tempfile
-import zipfile
+from functools import cached_property
from io import BytesIO
import requests
-from django.conf import settings
+import tqdm
from django.core.files.storage import default_storage
+from django.core.management import call_command
from django.db import transaction
+from kolibri_content.router import get_active_content_database
+from kolibri_content.router import using_content_database
+from le_utils.constants import completion_criteria
from le_utils.constants import content_kinds
from le_utils.constants import exercises
from le_utils.constants import format_presets
+from le_utils.constants import mastery_criteria
from le_utils.constants import roles
+from le_utils.constants.labels import learning_activities
from contentcuration import models
-from contentcuration.api import write_raw_content_to_storage
from contentcuration.utils.files import create_file_from_contents
+from contentcuration.utils.files import get_thumbnail_encoding
from contentcuration.utils.files import write_base64_to_file
from contentcuration.utils.garbage_collect import get_deleted_chefs_root
+from contentcuration.utils.publish import publish_channel
+from contentcuration.utils.storage.base import CompositeStorage
+from contentcuration.viewsets.assessmentitem import exercise_image_filename_regex
CHANNEL_TABLE = "content_channelmetadata"
@@ -47,126 +56,38 @@
log = logging.getLogger(__name__)
-def import_channel(
- source_id, target_id=None, download_url=None, editor=None, logger=None
-):
- """
- Import a channel from another Studio instance. This can be used to
- copy online Studio channels into local machines for development,
- testing, faster editing, or other purposes.
-
- :param source_id: The UUID of the channel to import from the source Studio instance.
- :param target_id: The UUID of the channel on the local instance. Defaults to source_id.
- :param download_url: The URL of the Studio instance to import from.
- :param editor: The email address of the user you wish to add as an editor, if any.
+class ImportClient(requests.Session):
+ def __init__(self, base_url, api_token=None):
+ super(ImportClient, self).__init__()
+ self.base_url = base_url
+ self.api_token = api_token
+ self.headers.update(
+ {
+ "User-Agent": f"restore_channel/kolibri-studio/dev python-requests/{requests.__version__}",
+ }
+ )
- """
+ def __getattr__(self, name):
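+        # Dynamically expose token-authenticated variants of the Session verbs:
+        # e.g. client.get_with_token(url) sends an "Authorization: Token ..." header,
+        # while plain client.get(url) stays anonymous.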
+ if name.endswith("_with_token"):
+ if not self.api_token:
+ raise ValueError("API token is required for this method.")
- global log
- if logger:
- log = logger
- else:
- log = logging.getLogger(__name__)
-
- # Set up variables for the import process
- log.info("\n\n********** STARTING CHANNEL IMPORT **********")
- start = datetime.datetime.now()
- target_id = target_id or source_id
-
- # Test connection to database
- log.info("Connecting to database for channel {}...".format(source_id))
-
- tempf = tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False)
- conn = None
- try:
- if download_url:
- response = requests.get(
- "{}/content/databases/{}.sqlite3".format(download_url, source_id)
+ target_method = getattr(
+ super(ImportClient, self), name.replace("_with_token", "")
)
- for chunk in response:
- tempf.write(chunk)
- else:
- filepath = "/".join([settings.DB_ROOT, "{}.sqlite3".format(source_id)])
- # Check if database exists
- if not default_storage.exists(filepath):
- raise IOError("The object requested does not exist.")
- with default_storage.open(filepath) as fobj:
- shutil.copyfileobj(fobj, tempf)
-
- tempf.close()
- conn = sqlite3.connect(tempf.name)
- cursor = conn.cursor()
-
- # Start by creating channel
- log.info("Creating channel...")
- editor = models.User.objects.get(email=editor)
- channel, root_pk = create_channel(conn, target_id, editor)
- channel.editors.add(editor)
- channel.save()
-
- # Create root node
- root = models.ContentNode.objects.create(
- node_id=root_pk,
- title=channel.name,
- kind_id=content_kinds.TOPIC,
- original_channel_id=target_id,
- source_channel_id=target_id,
- )
-
- # Create nodes mapping to channel
- log.info(" Creating nodes...")
- with transaction.atomic():
- create_nodes(cursor, target_id, root, download_url=download_url)
- # TODO: Handle prerequisites
-
- # Delete the previous tree if it exists
- old_previous = channel.previous_tree
- if old_previous:
- old_previous.parent = get_deleted_chefs_root()
- old_previous.title = "Old previous tree for channel {}".format(channel.pk)
- old_previous.save()
-
- # Save tree to target tree
- channel.previous_tree = channel.main_tree
- channel.main_tree = root
- channel.save()
- finally:
- conn and conn.close()
- tempf.close()
- os.unlink(tempf.name)
-
- # Print stats
- log.info(
- "\n\nChannel has been imported (time: {ms})\n".format(
- ms=datetime.datetime.now() - start
+ token_headers = {
+ "Authorization": f"Token {self.api_token}",
+ }
+ return lambda url, *args, **kwargs: target_method(
+ url, *args, headers=token_headers, **kwargs
+ )
+ raise AttributeError(
+ f"'{self.__class__.__name__}' object has no attribute '{name}'"
)
- )
- log.info("\n\n********** IMPORT COMPLETE **********\n\n")
-
-def create_channel(cursor, target_id, editor):
- """create_channel: Create channel at target id
- Args:
- cursor (sqlite3.Connection): connection to export database
- target_id (str): channel_id to write to
- Returns: channel model created and id of root node
- """
- id, name, description, thumbnail, root_pk, version, last_updated = cursor.execute(
- "SELECT id, name, description, thumbnail, root_pk, version, last_updated FROM {table}".format(
- table=CHANNEL_TABLE
- )
- ).fetchone()
- channel, is_new = models.Channel.objects.get_or_create(
- pk=target_id, actor_id=editor.id
- )
- channel.name = name
- channel.description = description
- channel.thumbnail = write_to_thumbnail_file(thumbnail)
- channel.thumbnail_encoding = {"base64": thumbnail, "points": [], "zoom": 0}
- channel.version = version
- channel.save()
- log.info("\tCreated channel {} with name {}".format(target_id, name))
- return channel, root_pk
+ def request(self, method, url, *args, **kwargs):
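+        # All request paths are relative; resolve them against the source Studio
+        # instance's base URL.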
+ url = f"{self.base_url}{url}"
+ return super(ImportClient, self).request(method, url, *args, **kwargs)
def write_to_thumbnail_file(raw_thumbnail):
@@ -195,446 +116,689 @@ def write_to_thumbnail_file(raw_thumbnail):
os.unlink(tempf.name)
-def create_nodes(cursor, target_id, parent, indent=1, download_url=None):
- """create_channel: Create channel at target id
- Args:
- cursor (sqlite3.Connection): connection to export database
- target_id (str): channel_id to write to
- parent (models.ContentNode): node's parent
- indent (int): How far to indent print statements
- Returns: newly created node
+def convert_metadata_to_dict(metadata):
"""
- # Read database rows that match parent
- parent_query = "parent_id='{}'".format(parent.node_id)
-
- sql_command = (
- "SELECT id, title, content_id, description, sort_order, "
- "license_owner, author, license_id, kind, coach_content, lang_id FROM {table} WHERE {query} ORDER BY sort_order;".format(
- table=NODE_TABLE, query=parent_query
- )
- )
- query = cursor.execute(sql_command).fetchall()
-
- # Parse through rows and create models
- for (
- id,
- title,
- content_id,
- description,
- sort_order,
- license_owner,
- author,
- license_id,
- kind,
- coach_content,
- lang_id,
- ) in query:
- log.info(
- "{indent} {id} ({title} - {kind})...".format(
- indent=" |" * indent, id=id, title=title, kind=kind
- )
- )
-
- # Determine role
- role = roles.LEARNER
- if coach_content:
- role = roles.COACH
+ Convert metadata from a string to a dictionary.
- # Determine extra_fields
- assessment_query = "SELECT mastery_model, randomize FROM {table} WHERE contentnode_id='{node}'".format(
- table=ASSESSMENTMETADATA_TABLE, node=id
- )
- result = cursor.execute(assessment_query).fetchone()
- extra_fields = result[0] if result else {}
- if isinstance(extra_fields, str):
- extra_fields = json.loads(extra_fields)
- if result:
- extra_fields.update({"randomize": result[1]})
-
- # Determine license
- license = retrieve_license(cursor, license_id)
- license_description = license[1] if license else ""
- license = license[0] if license else None
-
- # TODO: Determine thumbnail encoding
-
- # Create new node model
- node = models.ContentNode.objects.create(
- node_id=id,
- original_source_node_id=id,
- source_node_id=id,
- title=title,
- content_id=content_id,
- description=description,
- sort_order=sort_order,
- copyright_holder=license_owner,
- author=author,
- license=license,
- license_description=license_description,
- language_id=lang_id,
- role_visibility=role,
- extra_fields=extra_fields,
- kind_id=kind,
- parent=parent,
- original_channel_id=target_id,
- source_channel_id=target_id,
- )
-
- # Handle foreign key references (children, files, tags)
- if kind == content_kinds.TOPIC:
- create_nodes(
- cursor, target_id, node, indent=indent + 1, download_url=download_url
- )
- elif kind == content_kinds.EXERCISE:
- create_assessment_items(
- cursor, node, indent=indent + 1, download_url=download_url
- )
- create_files(cursor, node, indent=indent + 1, download_url=download_url)
- create_tags(cursor, node, target_id, indent=indent + 1)
+ :param metadata: The metadata string to convert.
+ :return: A dictionary representation of the metadata.
+ """
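+    # A comma-separated label string from the export is turned into a
+    # {label: True} mapping; anything else is passed through unchanged.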
+ if isinstance(metadata, str):
+ metadata_split = metadata.split(",")
+ return {metadata_key: True for metadata_key in metadata_split}
+ return metadata
- return node
+def convert_learning_activities_to_dict(content_kind, metadata):
+ """
+ Convert learning activities from a string to a dictionary.
-def retrieve_license(cursor, license_id):
- """retrieve_license_name: Get license based on id from exported db
- Args:
- cursor (sqlite3.Connection): connection to export database
- license_id (str): id of license on exported db
- Returns: license model matching the name and the associated license description
+    :param content_kind: The content kind of the node the learning activities belong to.
+ :param metadata: The learning activities string to convert.
+ :return: A dictionary representation of the learning activities.
"""
- # Handle no license being assigned
- if license_id is None or license_id == "":
+ metadata = convert_metadata_to_dict(metadata)
+ if isinstance(metadata, dict):
+ return metadata
+
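+    # No learning_activities value in the source DB; fall back to a default
+    # activity based on the content kind.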
+ if content_kind == content_kinds.EXERCISE:
+ return {learning_activities.PRACTICE: True}
+ elif content_kind in [content_kinds.HTML5, content_kinds.H5P]:
+ return {learning_activities.EXPLORE: True}
+ elif content_kind == content_kinds.AUDIO:
+ return {learning_activities.LISTEN: True}
+ elif content_kind == content_kinds.VIDEO:
+ return {learning_activities.WATCH: True}
+ elif content_kind == content_kinds.DOCUMENT:
+ return {learning_activities.READ: True}
+ elif content_kind == content_kinds.SLIDESHOW:
+ return {learning_activities.READ: True}
+ elif content_kind == content_kinds.TOPIC:
return None
+ return {learning_activities.EXPLORE: True}
- # Return license that matches name
- name, description = cursor.execute(
- "SELECT license_name, license_description FROM {table} WHERE id={id}".format(
- table=LICENSE_TABLE, id=license_id
- )
- ).fetchone()
- return models.License.objects.get(license_name=name), description
-
-
-def download_file(
- filename,
- download_url=None,
- contentnode=None,
- assessment_item=None,
- preset=None,
- file_size=None,
- lang_id=None,
-):
- checksum, extension = os.path.splitext(filename)
- extension = extension.lstrip(".")
- filepath = models.generate_object_storage_name(checksum, filename)
-
- # Download file if it hasn't already been downloaded
- if download_url and not default_storage.exists(filepath):
- buffer = BytesIO()
- response = requests.get(
- "{}/content/storage/{}/{}/{}".format(
- download_url, filename[0], filename[1], filename
- )
- )
- for chunk in response:
- buffer.write(chunk)
- checksum, _, filepath = write_raw_content_to_storage(
- buffer.getvalue(), ext=extension
- )
- buffer.close()
-
- # Save values to new file object
- file_obj = models.File(
- file_format_id=extension,
- file_size=file_size or default_storage.size(filepath),
- contentnode=contentnode,
- assessment_item=assessment_item,
- language_id=lang_id,
- preset_id=preset or "",
- )
- file_obj.file_on_disk.name = filepath
- file_obj.save()
-
-
-def create_files(cursor, contentnode, indent=0, download_url=None):
- """create_files: Get license
- Args:
- cursor (sqlite3.Connection): connection to export database
- contentnode (models.ContentNode): node file references
- indent (int): How far to indent print statements
- Returns: None
+class ImportManager(object):
+ """
+ Import a channel from another Studio instance. This can be used to copy online Studio channels
+ into local machines for development, testing, faster editing, or other purposes.
"""
- # Parse database for files referencing content node and make file models
- sql_command = (
- "SELECT checksum, extension, file_size, contentnode_id, "
- "lang_id, preset FROM {table} WHERE contentnode_id='{id}';".format(
- table=FILE_TABLE, id=contentnode.node_id
- )
- )
-
- query = cursor.execute(sql_command).fetchall()
- for checksum, extension, file_size, contentnode_id, lang_id, preset in query:
- filename = "{}.{}".format(checksum, extension)
- log.info(
- "{indent} * FILE {filename}...".format(
- indent=" |" * indent, filename=filename
- )
- )
-
- try:
- download_file(
- filename,
- download_url=download_url,
- contentnode=contentnode,
- preset=preset,
- file_size=file_size,
- lang_id=lang_id,
- )
- except IOError as e:
- log.warning("\b FAILED (check logs for more details)")
- sys.stderr.write(
- "Restoration Process Error: Failed to save file object {}: {}".format(
- filename, os.strerror(e.errno)
- )
+ def __init__(
+ self,
+ source_url,
+ source_id,
+ target_id=None,
+ editor=None,
+ public=False,
+ publish=False,
+ token=None,
+ download_content=True,
+ logger=None,
+ ):
+ self.source_id = source_id
+ self.target_id = target_id or source_id
+ self.source_url = source_url
+ self.editor = editor
+ self.public = public
+ self.publish = publish
+ self.token = token
+ self.download_content = download_content
+ self.logger = logger or logging.getLogger(__name__)
+ self.client = ImportClient(source_url, api_token=token)
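+        # When the default storage is a composite backend, write restored files
+        # through its writeable backend directly.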
+ self.storage = (
+ default_storage._get_writeable_backend()
+ if isinstance(default_storage, CompositeStorage)
+ else default_storage
+ )
+ self.conn = None
+ self.cursor = None
+ self.progress = None
+
+ @cached_property
+ def editor_user(self):
+ """
+ Get the User object for the editor email address.
+
+ :return: The User object for the editor.
+ """
+ return models.User.objects.get(email=self.editor) if self.editor else None
+
+ def run(self):
+ """
+ Run the import restoration process.
+ """
+ self.logger.info("********** STARTING CHANNEL RESTORATION **********")
+ # Set up variables for the import process
+ start = datetime.datetime.now()
+
+ if not self.token:
+ self.logger.warning(
+ "No API token provided. This may result in limited functionality."
)
- continue
+ # Test connection to the database
+ self.logger.info(f"Connecting to database for channel {self.source_id}...")
-def create_tags(cursor, contentnode, target_id, indent=0):
- """create_tags: Create tags associated with node
- Args:
- cursor (sqlite3.Connection): connection to export database
- contentnode (models.ContentNode): node file references
- target_id (str): channel_id to write to
- indent (int): How far to indent print statements
- Returns: None
- """
- # Parse database for files referencing content node and make file models
- sql_command = (
- "SELECT ct.id, ct.tag_name FROM {cnttable} cnt "
- "JOIN {cttable} ct ON cnt.contenttag_id = ct.id "
- "WHERE cnt.contentnode_id='{id}';".format(
- cnttable=NODE_TAG_TABLE,
- cttable=TAG_TABLE,
- id=contentnode.node_id,
- )
- )
- query = cursor.execute(sql_command).fetchall()
-
- # Build up list of tags
- tag_list = []
- for id, tag_name in query:
- log.info(
- "{indent} ** TAG {tag}...".format(indent=" |" * indent, tag=tag_name)
- )
- # Save values to new or existing tag object
- tag_obj, is_new = models.ContentTag.objects.get_or_create(
- pk=id,
- tag_name=tag_name,
- channel_id=target_id,
- )
- tag_list.append(tag_obj)
-
- # Save tags to node
- contentnode.tags.set(tag_list)
- contentnode.save()
+ tempf = tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False)
+ try:
+ response = self.client.get(f"/content/databases/{self.source_id}.sqlite3")
+ for chunk in response:
+ tempf.write(chunk)
+ tempf.close()
-def create_assessment_items(cursor, contentnode, indent=0, download_url=None):
- """create_assessment_items: Generate assessment items based on perseus zip
- Args:
- cursor (sqlite3.Connection): connection to export database
- contentnode (models.ContentNode): node assessment items reference
- indent (int): How far to indent print statements
- download_url (str): Domain to download files from
- Returns: None
- """
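+            # Apply any pending content-app migrations to the downloaded database
+            # so older channel exports can be read with the current models.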
+ with using_content_database(tempf.name):
+ call_command(
+ "migrate",
+ "content",
+ database=get_active_content_database(),
+                    interactive=False,
+ )
- # Parse database for files referencing content node and make file models
- sql_command = (
- "SELECT checksum, extension "
- "preset FROM {table} WHERE contentnode_id='{id}' AND preset='exercise';".format(
- table=FILE_TABLE, id=contentnode.node_id
- )
- )
-
- query = cursor.execute(sql_command).fetchall()
- for checksum, extension in query:
- filename = "{}.{}".format(checksum, extension)
- log.info(
- "{indent} * EXERCISE {filename}...".format(
- indent=" |" * indent, filename=filename
+ self.conn = sqlite3.connect(tempf.name)
+ self.cursor = self.conn.cursor()
+
+ # Start by creating the channel
+ self.logger.info("Creating channel...")
+ channel, root_pk = self._create_channel()
+ channel.editors.add(self.editor_user)
+ channel.save()
+
+ # Create the root node
+ root = models.ContentNode.objects.create(
+ node_id=root_pk,
+ title=channel.name,
+ kind_id=content_kinds.TOPIC,
+ original_channel_id=self.target_id,
+ source_channel_id=self.target_id,
+ complete=True,
)
- )
- try:
- # Store the downloaded zip into temporary storage
- tempf = tempfile.NamedTemporaryFile(
- suffix=".{}".format(extension), delete=False
+ self.logger.info("Creating nodes...")
+ total_nodes = self.cursor.execute(
+ f"SELECT COUNT(*) FROM {NODE_TABLE}"
+ ).fetchone()[0]
+ node_progress = tqdm.tqdm(
+ total=total_nodes, desc="Restoring nodes", unit="node"
)
- response = requests.get(
- "{}/content/storage/{}/{}/{}".format(
- download_url, filename[0], filename[1], filename
+
+ # Create nodes mapping to channel
+ with transaction.atomic():
+ self._create_nodes(root, node_progress)
+ node_progress.close()
+ self.logger.info("Creating assessment items...")
+ exercise_nodes = models.ContentNode.objects.filter(
+ kind_id=content_kinds.EXERCISE, tree_id=root.tree_id
)
- )
- for chunk in response:
- tempf.write(chunk)
- tempf.close()
- extract_assessment_items(tempf.name, contentnode, download_url=download_url)
- except IOError as e:
- log.warning("\b FAILED (check logs for more details)")
- sys.stderr.write(
- "Restoration Process Error: Failed to save file object {}: {}".format(
- filename, os.strerror(e.errno)
+ exercise_progress = tqdm.tqdm(
+ total=exercise_nodes.count(),
+ desc="Restoring assessments",
+ unit="node",
)
- )
- continue
+ chunk = []
+ for node in exercise_nodes.iterator(chunk_size=20):
+ chunk.append(node)
+ if len(chunk) >= 20:
+ self._create_assessment_items(chunk)
+ exercise_progress.update(len(chunk))
+ chunk = []
+ if chunk:
+ self._create_assessment_items(chunk)
+ exercise_progress.update(len(chunk))
+ exercise_progress.close()
+ # TODO: Handle prerequisites
+
+ # Delete the previous tree if it exists
+ old_previous = channel.previous_tree
+ if old_previous:
+ old_previous.parent = get_deleted_chefs_root()
+ old_previous.title = f"Old previous tree for channel {channel.pk}"
+ old_previous.save()
+
+ # Save the new tree to the target tree, and preserve the old one
+ channel.previous_tree = channel.main_tree
+ channel.main_tree = root
+ channel.save()
finally:
+ self.conn and self.conn.close()
+ tempf.close()
os.unlink(tempf.name)
+ # Publish the channel if requested
+ if self.publish:
+ self.logger.info("Publishing channel...")
+ publish_channel(self.editor_user.id, channel.id)
-def extract_assessment_items(filepath, contentnode, download_url=None):
- """extract_assessment_items: Create and save assessment items to content node
- Args:
- filepath (str): Where perseus zip is stored
- contentnode (models.ContentNode): node assessment items reference
- download_url (str): Domain to download files from
- Returns: None
- """
+ # Print stats
+ self.logger.info(
+ f"Channel has been imported (time: {datetime.datetime.now() - start})"
+ )
+ self.logger.info("********** IMPORT COMPLETE **********")
+
+ def _create_channel(self):
+ """
+ Create the channel at target id
+ """
+ (
+ id,
+ name,
+ description,
+ thumbnail,
+ root_pk,
+ version,
+ last_updated,
+ schema_version,
+ ) = self.cursor.execute(
+ f"""
+ SELECT
+ id, name, description, thumbnail, root_pk, version, last_updated,
+ min_schema_version
+ FROM {CHANNEL_TABLE}
+ """
+ ).fetchone()
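+        # Infer the channel language from the most common node language in the
+        # exported database.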
+ lang_id, _ = self.cursor.execute(
+ f"""
+ SELECT lang_id, COUNT(id) AS node_by_lang_count
+ FROM {NODE_TABLE}
+            GROUP BY lang_id
+            ORDER BY node_by_lang_count DESC
+ """
+ ).fetchone()
+ channel, is_new = models.Channel.objects.get_or_create(
+ pk=self.target_id, actor_id=self.editor_user.id
+ )
+ channel.name = name
+ channel.description = description
+ channel.language_id = lang_id
+ channel.thumbnail = write_to_thumbnail_file(thumbnail)
+ channel.thumbnail_encoding = {"base64": thumbnail, "points": [], "zoom": 0}
+ channel.version = version
+ channel.public = self.public
+ channel.save()
+ self.logger.info(f"Created channel {self.target_id} with name {name}")
+ return channel, root_pk
+
+ def _create_nodes(self, parent, progress):
+ """
+ Create node(s) for a channel with target id
+
+ :param parent: node's parent
+ :param progress: progress bar for node creation
+ """
+ sql_command = f"""
+ SELECT
+ id, title, content_id, description, sort_order, license_owner, author, license_id,
+ kind, coach_content, lang_id, grade_levels, resource_types, learning_activities,
+ accessibility_labels, categories, learner_needs, duration, options
+ FROM {NODE_TABLE}
+ WHERE parent_id = ?
+ ORDER BY sort_order;
+ """
+ query = self.cursor.execute(
+ sql_command, (getattr(parent, "node_id", parent),)
+ ).fetchall()
+
+ # Parse through rows and create models
+ for (
+ id,
+ title,
+ content_id,
+ description,
+ sort_order,
+ license_owner,
+ author,
+ license_id,
+ kind,
+ coach_content,
+ lang_id,
+ grade_levels,
+ resource_types,
+ learning_activities_,
+ accessibility_labels,
+ categories,
+ learner_needs,
+ duration,
+ options,
+ ) in query:
+ # Determine role
+ role = roles.LEARNER
+ if coach_content:
+ role = roles.COACH
+
+ # Determine license
+ license_result = self._retrieve_license(license_id)
+ license_description = license_result[1] if license_result else ""
+ license_result = license_result[0] if license_result else None
+
+ # Create the new node model
+ node = models.ContentNode.objects.create(
+ node_id=id,
+ original_source_node_id=id,
+ source_node_id=id,
+ title=title,
+ content_id=content_id,
+ description=description,
+ sort_order=sort_order,
+ copyright_holder=license_owner,
+ author=author,
+ license=license_result,
+ license_description=license_description,
+ language_id=lang_id,
+ role_visibility=role,
+ extra_fields=self._prepare_node_extra_fields(id, kind, options),
+ kind_id=kind,
+ parent=parent,
+ original_channel_id=self.target_id,
+ source_channel_id=self.target_id,
+ grade_levels=convert_metadata_to_dict(grade_levels),
+ resource_types=convert_metadata_to_dict(resource_types),
+ learning_activities=convert_learning_activities_to_dict(
+ kind, learning_activities_
+ ),
+ accessibility_labels=convert_metadata_to_dict(accessibility_labels),
+ categories=convert_metadata_to_dict(categories),
+ learner_needs=convert_metadata_to_dict(learner_needs),
+ )
- try:
- tempdir = tempfile.mkdtemp()
- with zipfile.ZipFile(filepath, "r") as zipf:
- zipf.extractall(tempdir)
- os.chdir(tempdir)
-
- with open("exercise.json", "rb") as fobj:
- data = json.load(fobj)
-
- for index, assessment_id in enumerate(data["all_assessment_items"]):
- with open("{}.json".format(assessment_id), "rb") as fobj:
- assessment_item = generate_assessment_item(
- assessment_id,
- index,
- data["assessment_mapping"][assessment_id],
- json.load(fobj),
- download_url=download_url,
+ # Handle foreign key references (children, files, tags)
+ if kind == content_kinds.TOPIC:
+ self._create_nodes(node, progress)
+ self._create_files(node)
+ self._create_tags(node)
+
+ # assessments are handled after all nodes are created, which also ensures nodes
+ # are marked complete
+ if kind != content_kinds.EXERCISE:
+ errors = node.mark_complete()
+ if errors:
+ self.logger.warning(f"Node {node.node_id} has errors: {errors}")
+ node.save()
+ progress.update(1)
+
+ def _prepare_node_extra_fields(self, node_id, kind, options):
+ """
+ Prepare extra fields for the node based on the kind and options. For exercises, it
+ retrieves the additional info from the assessment metadata.
+
+ :param node_id: the node ID
+ :param kind: the content kind
+ :param options: the options JSON string
+ :return: a dictionary of extra fields
+ """
+ extra_fields = {
+ "options": json.loads(options) if options else {},
+ }
+ completion_criteria_ = extra_fields["options"].get("completion_criteria", {})
+
+ # don't fill anything in if there is no completion_criteria, otherwise validation will fail
+ if completion_criteria_ and "learner_managed" not in completion_criteria_:
+ completion_criteria_.update(learner_managed=False)
+
+ if kind == content_kinds.EXERCISE:
+ randomize_sql = f"""
+ SELECT randomize, mastery_model
+ FROM {ASSESSMENTMETADATA_TABLE}
+ WHERE contentnode_id = ?
+ """
+ randomize, mastery_criteria_ = self.cursor.execute(
+ randomize_sql, (node_id,)
+ ).fetchone()
+            extra_fields["randomize"] = bool(randomize)
+ if mastery_criteria_:
+ mastery_criteria_ = json.loads(mastery_criteria_)
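+                # The exported mastery model is stored under "type"; Studio expects
+                # it as "mastery_model" inside the mastery completion criteria threshold.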
+ mastery_criteria_.update(mastery_model=mastery_criteria_.pop("type"))
+ completion_criteria_.update(
+ {
+ "model": completion_criteria.MASTERY,
+ "threshold": mastery_criteria_,
+ }
)
- contentnode.assessment_items.add(assessment_item)
- finally:
- shutil.rmtree(tempdir)
+ if completion_criteria_.get("model") == completion_criteria.MASTERY:
+ mastery_model = completion_criteria_.get("threshold", {}).get(
+ "mastery_model"
+ )
+ if mastery_model in [
+ mastery_criteria.DO_ALL,
+ mastery_criteria.NUM_CORRECT_IN_A_ROW_2,
+ mastery_criteria.NUM_CORRECT_IN_A_ROW_3,
+ mastery_criteria.NUM_CORRECT_IN_A_ROW_5,
+ mastery_criteria.NUM_CORRECT_IN_A_ROW_10,
+ ]:
+ # remove m,n values
+ completion_criteria_["threshold"] = {
+ "mastery_model": mastery_model,
+ }
-def generate_assessment_item(
- assessment_id, order, assessment_type, assessment_data, download_url=None
-):
- """generate_assessment_item: Generates a new assessment item
- Args:
- assessment_id (str): AssessmentItem.assessment_id value
- order (Number): AssessmentItem.order value
- assessment_type (str): AssessmentItem.type value
- assessment_data (dict): Extracted data from perseus file
- download_url (str): Domain to download files from
- Returns: models.AssessmentItem
- """
- assessment_item = models.AssessmentItem.objects.create(
- assessment_id=assessment_id, type=assessment_type, order=order
- )
- if assessment_type == exercises.PERSEUS_QUESTION:
- assessment_item.raw_data = json.dumps(assessment_data)
- else:
- # Parse questions
- assessment_data["question"]["content"] = "\n\n".join(
- assessment_data["question"]["content"].split("\n\n")[:-1]
- )
- assessment_item.question = process_content(
- assessment_data["question"], assessment_item, download_url=download_url
- )
+ extra_fields["options"].update(completion_criteria=completion_criteria_)
+ return extra_fields
+
+ def _retrieve_license(self, license_id):
+ """
+ Get license based on id from exported db
+
+ :param license_id: id of license on exported db
+ :return: license model matching the id and the associated license description
+ :rtype: tuple
+ """
+ # Handle no license being assigned
+ if license_id is None or license_id == "":
+ return None
+
+ # Return license that matches name
+ name, description = self.cursor.execute(
+ f"""
+ SELECT license_name, license_description
+ FROM {LICENSE_TABLE}
+ WHERE id = ?
+ """,
+ (license_id,),
+ ).fetchone()
+ return models.License.objects.get(license_name=name), description
+
+ def _create_files(self, contentnode):
+ """
+ Create and possibly download node files
+
+ :param contentnode: node file references
+ """
+ # Parse database for files referencing content node and make file models
+ sql_command = f"""
+ SELECT checksum, extension, file_size, contentnode_id, lang_id, preset, thumbnail
+ FROM {FILE_TABLE}
+ WHERE contentnode_id = ?;
+ """
+ query = self.cursor.execute(sql_command, (contentnode.node_id,)).fetchall()
+
+ for (
+ checksum,
+ extension,
+ file_size,
+ contentnode_id,
+ lang_id,
+ preset,
+ is_thumbnail,
+ ) in query:
+ filename = "{}.{}".format(checksum, extension)
- # Parse answers
- answer_data = assessment_data["question"]["widgets"][
- ANSWER_FIELD_MAP[assessment_type]
- ]["options"]
- if assessment_type == exercises.INPUT_QUESTION:
- assessment_item.answers = json.dumps(
- [
- {"answer": answer["value"], "correct": True}
- for answer in answer_data["answers"]
- ]
- )
- else:
- assessment_item.answers = json.dumps(
- [
+ try:
+ self._download_file(
+ filename,
+ contentnode=contentnode,
+ preset=preset,
+ file_size=file_size,
+ lang_id=lang_id,
+ is_thumbnail=is_thumbnail,
+ )
+ except IOError as e:
+ self.logger.warning(f"FAILED to download '{filename}': {str(e)}")
+ if e.errno:
+ sys.stderr.write(
+ f"Restoration Process Error: Failed to save file object {filename}: {os.strerror(e.errno)}"
+ )
+ continue
+
+ def _download_file(
+ self,
+ filename,
+ contentnode=None,
+ assessment_item=None,
+ preset=None,
+ file_size=None,
+ lang_id=None,
+ is_thumbnail=False,
+ ):
+ """
+ Create and possibly download a file from source instance and save to local storage
+
+ :param filename: the name of the file to download
+ :param contentnode: the associated content node
+ :param assessment_item: the associated assessment item
+ :param preset: the format preset for the file
+ :param file_size: the known size of the file
+ :param lang_id: the language ID of the file
+ :param is_thumbnail: whether the file is a thumbnail
+ """
+ checksum, extension = os.path.splitext(filename)
+ extension = extension.lstrip(".")
+ filepath = models.generate_object_storage_name(checksum, filename)
+
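+        # Remote files are served from /content/storage/<c1>/<c2>/<filename>,
+        # where c1 and c2 are the first two characters of the checksum-based filename.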
+ file_url = f"/content/storage/{filename[0]}/{filename[1]}/{filename}"
+ file_exists = False
+
+ # If the file already exists, get the size from the storage
+ if self.storage.exists(filepath):
+ file_size = file_size or self.storage.size(filepath)
+ file_exists = True
+ # otherwise, download the file if we were instructed to, or if it is a thumbnail needed for the node's encoding
+ elif self.download_content or (is_thumbnail and contentnode):
+ buffer = BytesIO()
+ response = self.client.get(file_url)
+ for chunk in response:
+ buffer.write(chunk)
+
+ if is_thumbnail and contentnode:
+ # If the file is a thumbnail, store its base64 encoding on the content node
+ contentnode.thumbnail_encoding = json.dumps(
{
- "answer": process_content(
- answer, assessment_item, download_url=download_url
- ),
- "correct": answer["correct"],
+ "base64": get_thumbnail_encoding(filename, input_buffer=buffer),
+ "points": [],
+ "zoom": 0,
}
- for answer in answer_data["choices"]
- ]
- )
- assessment_item.randomize = answer_data["randomize"]
+ )
+ else:
+ # hash the downloaded bytes to derive the storage path; keep `checksum`
+ # (taken from the filename) intact for the File record created below
+ hasher = hashlib.md5()
+ hasher.update(buffer.getvalue())
+ hashed_filename = hasher.hexdigest()
+ full_filename = "{}.{}".format(hashed_filename, extension.lower())
+ filepath = models.generate_object_storage_name(
+ hashed_filename, full_filename
+ )
- # Parse hints
- assessment_item.hints = json.dumps(
+ self.storage.save(filepath, buffer)
+ buffer.close()
+ file_exists = True
+ # otherwise, if file size is not known, get it from the response headers
+ elif not file_size:
+ response = self.client.head(file_url)
+ file_size = int(response.headers.get("Content-Length", 0))
+
+ # Save values to a new file object
+ file_obj = models.File(
+ file_format_id=extension,
+ file_size=file_size,
+ contentnode=contentnode,
+ assessment_item=assessment_item,
+ language_id=lang_id,
+ preset_id=preset or "",
+ checksum=checksum,
+ )
+ file_obj.file_on_disk.name = filepath
+ # set_by_file_on_disk: only derive attributes from the file on disk when it actually exists in storage
+ file_obj.save(set_by_file_on_disk=file_exists)
+
+ def _create_tags(self, contentnode):
+ """
+ Create tags associated with node
+
+ :param contentnode: the content node for which to create tags
+ """
+ # Parse database for tags referencing the content node and make tag models
+ sql_command = f"""
+ SELECT ct.tag_name
+ FROM {NODE_TAG_TABLE} cnt
+ JOIN {TAG_TABLE} ct ON cnt.contenttag_id = ct.id
+ WHERE cnt.contentnode_id = ?;
+ """
+ query = self.cursor.execute(sql_command, (contentnode.node_id,)).fetchall()
+
+ models.ContentTag.objects.bulk_create(
[
- {
- "hint": process_content(
- hint, assessment_item, download_url=download_url
- )
- }
- for hint in assessment_data["hints"]
- ]
+ models.ContentTag(
+ tag_name=tag_name,
+ channel_id=self.target_id,
+ )
+ for (tag_name,) in query
+ ],
+ ignore_conflicts=True,
)
- assessment_item.save()
- return assessment_item
+ # Save tags to node
+ contentnode.tags.set(
+ models.ContentTag.objects.filter(
+ tag_name__in=[tag_name for (tag_name,) in query], channel_id=self.target_id
+ )
+ )
+ contentnode.save()
+
+ def _create_assessment_items(self, nodes):
+ """
+ Generate assessment items based on API data
-def process_content(data, assessment_item, download_url=None):
- """process_content: Parses perseus text for special formatting (e.g. formulas, images)
- Args:
- data (dict): Perseus data to parse (e.g. parsing 'question' field)
- download_url (str): Domain to download files from
- assessment_item (models.AssessmentItem): assessment item to save images to
- Returns: models.AssessmentItem
- """
- data["content"] = data["content"].replace(
- " ", ""
- ) # Remove unrecognized non unicode characters
- # Process formulas
- for match in re.finditer(r"(\$[^\$☣]+\$)", data["content"]):
- data["content"] = data["content"].replace(
- match.group(0), "${}$".format(match.group(0))
+ :param nodes: the nodes for which to look up assessment items
+ """
+ # Note: this method handles several different IDs: local node_id values, the source channel ID, and the remote content nodes' primary keys
+ node_ids = [node.node_id for node in nodes]
+
+ if not self.token:
+ self.logger.warning(
+ f"Skipping assessment items for node(s) {','. join(node_ids)}"
+ )
+ return
+
+ # first obtain the remote nodes' primary keys by querying with (node ID, channel ID) pairs
+ node_channel_ids = f",{self.source_id},".join(node_ids)
+ nodes_response = self.client.get_with_token(
+ f"/api/contentnode?_node_id_channel_id___in={node_channel_ids},{self.source_id}"
)
+ if nodes_response.status_code != 200:
+ self.logger.warning(
+ f"Failed to obtain assessment items for node(s) {','. join(node_ids)}"
+ )
+ return
- # Process images
+ nodes_data = nodes_response.json()
+ remote_node_pks = [n["id"] for n in nodes_data] if nodes_data else None
- for match in re.finditer(
- r"!\[[^\]]*\]\((\$(\{☣ LOCALPATH\}\/images)\/([^\.]+\.[^\)]+))\)",
- data["content"],
- ):
- data["content"] = data["content"].replace(
- match.group(2), exercises.CONTENT_STORAGE_PLACEHOLDER
+ if not remote_node_pks:
+ self.logger.warning(
+ f"No content node found for node(s) {','. join(node_ids)}"
+ )
+ return
+
+ # Get the content node's assessment items
+ assessment_response = self.client.get_with_token(
+ f"/api/assessmentitem?contentnode__in={','.join(remote_node_pks)}"
)
- image_data = data["images"].get(match.group(1))
- if image_data and image_data.get("width"):
- data["content"] = data["content"].replace(
- match.group(3),
- "{} ={}x{}".format(
- match.group(3), image_data["width"], image_data["height"]
- ),
+ if assessment_response.status_code != 200:
+ self.logger.warning(
+ f"Failed to obtain assessment items for node(s) {','. join(node_ids)}"
)
+ return
- # Save files to db
- download_file(
- match.group(3),
- assessment_item=assessment_item,
- preset=format_presets.EXERCISE,
- download_url=download_url,
+ assessment_items = assessment_response.json()
+ if not assessment_items:
+ self.logger.warning(
+ f"No assessment items found for node(s) {','. join(node_ids)}"
+ )
+ return
+
+ remote_node_pk_map = (
+ {n["node_id"]: n["id"] for n in nodes_data} if nodes_data else {}
)
- return data["content"]
+ for local_node in nodes:
+ remote_contentnode_id = remote_node_pk_map.get(local_node.node_id)
+ reduced_assessment_items = [
+ item
+ for item in assessment_items
+ if item["contentnode"] == remote_contentnode_id
+ ]
+
+ if not reduced_assessment_items:
+ self.logger.warning(
+ f"No assessment items found for node {local_node.node_id}"
+ )
+ continue
+
+ for item in reduced_assessment_items:
+ assessment_item = models.AssessmentItem.objects.create(
+ assessment_id=item["assessment_id"],
+ type=item["type"],
+ order=item["order"],
+ question=item["question"],
+ answers=item["answers"],
+ hints=item["hints"],
+ raw_data=item["raw_data"],
+ source_url=item["source_url"],
+ randomize=item.get("randomize", False),
+ )
+ self._process_assessment_images(assessment_item)
+ local_node.assessment_items.add(assessment_item)
+ errors = local_node.mark_complete()
+ if errors:
+ self.logger.warning(f"Node {local_node.node_id} has errors: {errors}")
+ local_node.save()
+
+ def _process_assessment_images(self, assessment_item):
+ """
+ Process images in assessment items and save them to the database.
+
+ :param assessment_item: The assessment item to process.
+ """
+ if not self.download_content:
+ # Skip if not downloading content
+ return
+
+ for content in [
+ assessment_item.question,
+ assessment_item.answers,
+ assessment_item.hints,
+ ]:
+ for match in re.finditer(exercise_image_filename_regex, content):
+ # Save files to db
+ self._download_file(
+ match.group(3),
+ assessment_item=assessment_item,
+ preset=format_presets.EXERCISE,
+ )
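Note for readers of `_create_assessment_items`: the `_node_id_channel_id___in` filter it builds is easy to misread, so here is a small standalone sketch of the string it produces. The channel and node IDs below are invented purely for illustration.

    # Illustrative only: how the alternating node_id/channel_id pair string is built.
    source_id = "abcdef0123456789abcdef0123456789"   # hypothetical source channel ID
    node_ids = ["node1", "node2", "node3"]           # hypothetical local node IDs

    # Joining with ",<channel>," interleaves the channel ID between node IDs;
    # the final channel ID is appended when the URL is formatted.
    node_channel_ids = f",{source_id},".join(node_ids)
    url = f"/api/contentnode?_node_id_channel_id___in={node_channel_ids},{source_id}"

    # -> /api/contentnode?_node_id_channel_id___in=node1,<channel>,node2,<channel>,node3,<channel>
    print(url)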
diff --git a/contentcuration/contentcuration/utils/storage/__init__.py b/contentcuration/contentcuration/utils/storage/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/contentcuration/contentcuration/utils/storage/base.py b/contentcuration/contentcuration/utils/storage/base.py
new file mode 100644
index 0000000000..a78e54153f
--- /dev/null
+++ b/contentcuration/contentcuration/utils/storage/base.py
@@ -0,0 +1,97 @@
+from django.core.files.storage import Storage as BaseStorage
+
+
+class Storage(BaseStorage):
+ def writeable(self):
+ """
+ :rtype: bool
+ """
+ return True
+
+ def get_client(self):
+ """
+ :rtype: object
+ """
+ return None
+
+ def get_presigned_put_url(
+ self, filepath, md5sum, lifetime_sec, mimetype="application/octet-stream"
+ ):
+ """
+ Creates a pre-signed URL for uploading files.
+
+ :param filepath: A string representing the destination file path inside the bucket
+ :param md5sum: An MD5 checksum of the file to be uploaded
+ :param lifetime_sec: The lifetime of the URL in seconds
+ :param mimetype: The content type of the file to be uploaded
+ :return: A pre-signed URL for uploading the file
+ """
+ raise NotImplementedError("Subclasses must implement this method")
+
+
+class CompositeStorage(Storage):
+ def __init__(self):
+ self.backends = []
+
+ def _get_writeable_backend(self):
+ """
+ :rtype: Storage
+ """
+ for backend in self.backends:
+ if backend.writeable:
+ return backend
+ raise AssertionError("No writeable backend found")
+
+ def _get_readable_backend(self, name):
+ """
+ :rtype: Storage
+ """
+ for backend in self.backends:
+ if backend.exists(name):
+ return backend
+ raise FileNotFoundError("{} not found".format(name))
+
+ def get_client(self):
+ return self._get_writeable_backend().get_client()
+
+ def open(self, name, mode="rb"):
+ return self._get_readable_backend(name).open(name, mode)
+
+ def save(self, name, content, max_length=None):
+ return self._get_writeable_backend().save(name, content, max_length=max_length)
+
+ def delete(self, name):
+ self._get_writeable_backend().delete(name)
+
+ def exists(self, name):
+ try:
+ self._get_readable_backend(name)
+ return True
+ except FileNotFoundError:
+ return False
+
+ def listdir(self, path):
+ # This method was never implemented by the underlying storage backends
+ raise NotImplementedError("listdir is not implemented for CompositeStorage")
+
+ def size(self, name):
+ return self._get_readable_backend(name).size(name)
+
+ def url(self, name):
+ return self._get_readable_backend(name).url(name)
+
+ def get_accessed_time(self, name):
+ return self._get_readable_backend(name).get_accessed_time(name)
+
+ def get_created_time(self, name):
+ return self._get_readable_backend(name).get_created_time(name)
+
+ def get_modified_time(self, name):
+ return self._get_readable_backend(name).get_modified_time(name)
+
+ def get_presigned_put_url(
+ self, filepath, md5sum, lifetime_sec, mimetype="application/octet-stream"
+ ):
+ return self._get_writeable_backend().get_presigned_put_url(
+ filepath, md5sum, lifetime_sec, mimetype=mimetype
+ )
diff --git a/contentcuration/contentcuration/utils/storage/common.py b/contentcuration/contentcuration/utils/storage/common.py
new file mode 100644
index 0000000000..6a40768720
--- /dev/null
+++ b/contentcuration/contentcuration/utils/storage/common.py
@@ -0,0 +1,69 @@
+import mimetypes
+import os
+
+from django.core.files.storage import default_storage
+
+from .base import CompositeStorage
+from .base import Storage
+
+
+# Do this to ensure that we infer mimetypes for files properly, specifically
+# zip and epub files.
+# To add additional file types, add them to the mime.types file.
+mimetypes.init([os.path.join(os.path.dirname(__file__), "mime.types")])
+
+
+class UnknownStorageBackendError(Exception):
+ pass
+
+
+def determine_content_type(filename):
+ """
+ Guesses the content type of a filename and returns its mimetype.
+
+ Returns "application/octet-stream" if the type can't be guessed.
+ """
+
+ typ, _ = mimetypes.guess_type(filename)
+
+ if not typ:
+ return "application/octet-stream"
+ return typ
+
+
+def get_presigned_upload_url(
+ filepath,
+ md5sum_b64,
+ lifetime_sec,
+ storage=default_storage,
+):
+ """
+ Return a presigned URL that can modify the given filepath through a PUT
+ request. Performing a PUT request on the returned URL changes the object's
+ contents with the contents of your PUT request.
+
+ :param: filepath: the file path inside the bucket, to the file.
+ :param: md5sum_b64: the base64 encoded md5 hash of the file. The holder of the URL will
+ have to set a Content-MD5 HTTP header matching this md5sum once it
+ initiates the upload.
+ :param: lifetime_sec: the lifetime of the generated upload url, in seconds.
+
+ :returns: a dictionary containing 2 keys:
+ mimetype: the mimetype that will be required to send as part of the file upload's mimetype header
+ uploadURL: the URL to upload the file to.
+
+ :raises: :class:`UnknownStorageBackendError`: If the storage backend does not implement the base Storage interface.
+ """
+ mimetype = determine_content_type(filepath)
+
+ if isinstance(storage, (Storage, CompositeStorage)):
+ upload_url = storage.get_presigned_put_url(
+ filepath, md5sum_b64, lifetime_sec, mimetype=mimetype
+ )
+ else:
+ raise UnknownStorageBackendError(
+ "Please ensure your storage backend is either Google Cloud Storage or S3 Storage!"
+ )
+
+ return {"mimetype": mimetype, "uploadURL": upload_url}
diff --git a/contentcuration/contentcuration/utils/storage/dev.py b/contentcuration/contentcuration/utils/storage/dev.py
new file mode 100644
index 0000000000..7e77a6e305
--- /dev/null
+++ b/contentcuration/contentcuration/utils/storage/dev.py
@@ -0,0 +1,53 @@
+from django.conf import settings
+from django_s3_storage.storage import S3Storage
+from google.cloud.storage import Client
+
+from contentcuration.utils.storage.base import CompositeStorage as BaseCompositeStorage
+from contentcuration.utils.storage.base import Storage as BaseStorage
+from contentcuration.utils.storage.gcs import GoogleCloudStorage
+
+
+class Storage(S3Storage, BaseStorage):
+ def get_client(self):
+ """
+ :rtype: botocore.client.BaseClient
+ """
+ return self.s3_connection
+
+ def get_presigned_put_url(self, filepath, md5sum, lifetime_sec, mimetype=None):
+ """
+ Creates a pre-signed URL for development storage backends
+
+ Note that since our production object storage backend is GCS, we do not enforce or require
+ any Content-MD5 value.
+
+ :param: filepath: the file path inside the bucket that the user can PUT their object.
+ :param: md5sum: the base64-encoded MD5sum of the object the user is planning to PUT.
+ This is ignored here and accepted only to keep the signature compatible with the
+ base Storage interface.
+ :param: lifetime_sec: how long before the presigned URL expires, in seconds.
+ :param: mimetype: the content type of the file to be uploaded
+ :return: A pre-signed URL for uploading the file
+ """
+ # S3's PUT Object parameters:
+ # https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html
+ method = "put_object"
+ fields = {
+ "Bucket": settings.AWS_S3_BUCKET_NAME,
+ "Key": filepath,
+ }
+
+ return self.get_client().generate_presigned_url(
+ ClientMethod=method,
+ Params=fields,
+ ExpiresIn=lifetime_sec,
+ )
+
+
+class CompositeStorage(BaseCompositeStorage):
+ def __init__(self):
+ super(CompositeStorage, self).__init__()
+ self.backends.append(Storage())
+ self.backends.append(
+ GoogleCloudStorage(Client.create_anonymous_client(), "studio-content")
+ )
diff --git a/contentcuration/contentcuration/utils/gcs_storage.py b/contentcuration/contentcuration/utils/storage/gcs.py
similarity index 80%
rename from contentcuration/contentcuration/utils/gcs_storage.py
rename to contentcuration/contentcuration/utils/storage/gcs.py
index 5c4a425aec..38e0347d2d 100644
--- a/contentcuration/contentcuration/utils/gcs_storage.py
+++ b/contentcuration/contentcuration/utils/storage/gcs.py
@@ -1,16 +1,19 @@
import logging
import tempfile
+from datetime import timedelta
from gzip import GzipFile
from io import BytesIO
import backoff
from django.conf import settings
from django.core.files import File
-from django.core.files.storage import Storage
from google.cloud.exceptions import InternalServerError
from google.cloud.storage import Client
from google.cloud.storage.blob import Blob
+from contentcuration.utils.storage.base import CompositeStorage
+from contentcuration.utils.storage.base import Storage
+
OLD_STUDIO_STORAGE_PREFIX = "/contentworkshop_content/"
CONTENT_DATABASES_MAX_AGE = 5 # seconds
@@ -122,7 +125,7 @@ def save(self, name, fobj, max_length=None, blob_object=None):
# determine the current file's mimetype based on the name
# import determine_content_type lazily in here, so we don't get into an infinite loop with circular dependencies
- from contentcuration.utils.storage_common import determine_content_type
+ from contentcuration.utils.storage.common import determine_content_type
content_type = determine_content_type(name)
@@ -215,10 +218,41 @@ def _is_file_empty(fobj):
fobj.seek(current_location)
return len(byt) == 0
+ def get_presigned_put_url(
+ self, filepath, md5sum, lifetime_sec, mimetype="application/octet-stream"
+ ):
+ """
+ Creates a pre-signed URL for GCS.
+
+ :param filepath: A string representing the destination file path inside the bucket
+ :param md5sum: An MD5 checksum of the file to be uploaded
+ :param lifetime_sec: The lifetime of the URL in seconds
+ :param mimetype: The content type of the file to be uploaded
+ :return: A pre-signed URL for uploading the file
+ """
+ blob_obj = self.bucket.blob(filepath)
+
+ # ensure the md5sum doesn't have any whitespace, including newlines.
+ # We should do the same whitespace stripping as well on any client that actually
+ # uses the returned presigned url.
+ md5sum_stripped = md5sum.strip()
+
+ # convert the lifetime to a timedelta, so gcloud library will interpret the lifetime
+ # as the seconds from right now. If we use an absolute integer, it's the number of seconds
+ # from unix time
+ lifetime_timedelta = timedelta(seconds=lifetime_sec)
+
+ return blob_obj.generate_signed_url(
+ method="PUT",
+ content_md5=md5sum_stripped,
+ content_type=mimetype,
+ expiration=lifetime_timedelta,
+ )
+
-class CompositeGCS(Storage):
+class CompositeGCS(CompositeStorage):
def __init__(self):
- self.backends = []
+ super(CompositeGCS, self).__init__()
self.backends.append(
GoogleCloudStorage(_create_default_client(), settings.AWS_S3_BUCKET_NAME)
)
@@ -227,59 +261,3 @@ def __init__(self):
self.backends.append(
GoogleCloudStorage(Client.create_anonymous_client(), "studio-content")
)
-
- def _get_writeable_backend(self):
- """
- :rtype: GoogleCloudStorage
- """
- for backend in self.backends:
- if backend.writeable:
- return backend
- raise AssertionError("No writeable backend found")
-
- def _get_readable_backend(self, name):
- """
- :rtype: GoogleCloudStorage
- """
- for backend in self.backends:
- if backend.exists(name):
- return backend
- raise FileNotFoundError("{} not found".format(name))
-
- def get_client(self):
- return self._get_writeable_backend().get_client()
-
- def open(self, name, mode="rb"):
- return self._get_readable_backend(name).open(name, mode)
-
- def save(self, name, content, max_length=None):
- return self._get_writeable_backend().save(name, content, max_length=max_length)
-
- def delete(self, name):
- self._get_writeable_backend().delete(name)
-
- def exists(self, name):
- try:
- self._get_readable_backend(name)
- return True
- except FileNotFoundError:
- return False
-
- def listdir(self, path):
- # This method was not implemented on GoogleCloudStorage to begin with
- raise NotImplementedError("listdir is not implemented for CompositeGCS")
-
- def size(self, name):
- return self._get_readable_backend(name).size(name)
-
- def url(self, name):
- return self._get_readable_backend(name).url(name)
-
- def get_accessed_time(self, name):
- return self._get_readable_backend(name).get_accessed_time(name)
-
- def get_created_time(self, name):
- return self._get_readable_backend(name).get_created_time(name)
-
- def get_modified_time(self, name):
- return self._get_readable_backend(name).get_modified_time(name)
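The GCS-signed counterpart differs from the development backend in one important way: because `generate_signed_url` is called with `content_md5` and `content_type`, the eventual PUT should echo the same (whitespace-stripped) Content-MD5 and Content-Type headers for the signature to validate. A hedged sketch with placeholder values:

    import requests

    upload_url = "https://storage.googleapis.com/<bucket>/<path>?<signature>"        # placeholder
    md5sum_b64 = "1B2M2Y8AsgTpgAmY7PhCfg==\n".strip()   # strip whitespace, mirroring get_presigned_put_url

    with open("example.mp4", "rb") as f:
        response = requests.put(
            upload_url,
            data=f,
            headers={"Content-MD5": md5sum_b64, "Content-Type": "video/mp4"},
        )
    response.raise_for_status()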
diff --git a/contentcuration/contentcuration/utils/storage_common.py b/contentcuration/contentcuration/utils/storage_common.py
deleted file mode 100644
index 10d79bd5c5..0000000000
--- a/contentcuration/contentcuration/utils/storage_common.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import mimetypes
-import os
-from datetime import timedelta
-
-from django.conf import settings
-from django.core.files.storage import default_storage
-from django_s3_storage.storage import S3Storage
-
-from .gcs_storage import CompositeGCS
-from .gcs_storage import GoogleCloudStorage
-
-
-# Do this to ensure that we infer mimetypes for files properly, specifically
-# zip file and epub files.
-# to add additional files add them to the mime.types file
-mimetypes.init([os.path.join(os.path.dirname(__file__), "mime.types")])
-
-
-class UnknownStorageBackendError(Exception):
- pass
-
-
-def determine_content_type(filename):
- """
- Guesses the content type of a filename. Returns the mimetype of a file.
-
- Returns "application/octet-stream" if the type can't be guessed.
- Raises an AssertionError if filename is not a string.
- """
-
- typ, _ = mimetypes.guess_type(filename)
-
- if not typ:
- return "application/octet-stream"
- return typ
-
-
-def get_presigned_upload_url(
- filepath,
- md5sum_b64,
- lifetime_sec,
- content_length,
- storage=default_storage,
- client=None,
-):
- """Return a presigned URL that can modify the given filepath through a PUT
- request. Performing a PUT request on the returned URL changes the object's
- contents with the contents of your PUT request.
-
- :param: filepath: the file path inside the bucket, to the file.
- :param: md5sum_b64: the base64 encoded md5 hash of the file. The holder of the URL will
- have to set a Content-MD5 HTTP header matching this md5sum once it
- initiates the download.
- :param: lifetime_sec: the lifetime of the generated upload url, in seconds.
- :param: content_length: the size of the content, in bytes.
- :param: client: the storage client that will be used to gennerate the presigned URL.
- This must have an API that's similar to either the GCS client or the boto3 client.
-
- :returns: a dictionary containing 2 keys:
- mimetype: the mimetype that will be required to send as part of the file upload's mimetype header
- uploadURL: the URL to upload the file to.
-
- :raises: :class:`UnknownStorageBackendError`: If the storage backend is not S3 or GCS.
- """
-
- # Aron: note that content_length is not used right now because
- # both storage types are having difficulties enforcing it.
-
- mimetype = determine_content_type(filepath)
- if isinstance(storage, (GoogleCloudStorage, CompositeGCS)):
- client = client or storage.get_client()
- bucket = settings.AWS_S3_BUCKET_NAME
- upload_url = _get_gcs_presigned_put_url(
- client, bucket, filepath, md5sum_b64, lifetime_sec, mimetype=mimetype
- )
- elif isinstance(storage, S3Storage):
- bucket = settings.AWS_S3_BUCKET_NAME
- client = client or storage.s3_connection
- upload_url = _get_s3_presigned_put_url(
- client, bucket, filepath, md5sum_b64, lifetime_sec
- )
- else:
- raise UnknownStorageBackendError(
- "Please ensure your storage backend is either Google Cloud Storage or S3 Storage!"
- )
-
- return {"mimetype": mimetype, "uploadURL": upload_url}
-
-
-def _get_gcs_presigned_put_url(
- gcs_client,
- bucket,
- filepath,
- md5sum,
- lifetime_sec,
- mimetype="application/octet-stream",
-):
- bucket_obj = gcs_client.get_bucket(bucket)
- blob_obj = bucket_obj.blob(filepath)
-
- # ensure the md5sum doesn't have any whitespace, including newlines.
- # We should do the same whitespace stripping as well on any client that actually
- # uses the returned presigned url.
- md5sum_stripped = md5sum.strip()
-
- # convert the lifetime to a timedelta, so gcloud library will interpret the lifetime
- # as the seconds from right now. If we use an absolute integer, it's the number of seconds
- # from unix time
- lifetime_timedelta = timedelta(seconds=lifetime_sec)
-
- url = blob_obj.generate_signed_url(
- method="PUT",
- content_md5=md5sum_stripped,
- content_type=mimetype,
- expiration=lifetime_timedelta,
- )
-
- return url
-
-
-def _get_s3_presigned_put_url(s3_client, bucket, filepath, md5sum, lifetime_sec):
- """
- Creates a pre-signed URL for S3-like backends, e.g. Minio.
-
- Note that since our production object storage backend is GCS, we do not enforce or require
- any Content-MD5 value.
-
- :param: s3_client: an initialized S3 client. We will use this to create the presigned PUT url.
- :param: bucket: the bucket where the user can PUT their object.
- :param: filepath: the file path inside the bucket that the user can PUT their object.
- :param: md5sum: the base64-encoded MD5sum of the object the user is planning to PUT.
- This is ignored for this function and added solely to maintain API compatibility with other
- private presigned URL functions.
- :param: lifetime_sec: how long before the presigned URL expires, in seconds.
- """
- # S3's PUT Object parameters:
- # https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html
- method = "put_object"
- fields = {
- "Bucket": bucket,
- "Key": filepath,
- }
-
- response = s3_client.generate_presigned_url(
- ClientMethod=method,
- Params=fields,
- ExpiresIn=lifetime_sec,
- )
-
- return response
diff --git a/contentcuration/contentcuration/viewsets/file.py b/contentcuration/contentcuration/viewsets/file.py
index afadbff0cb..f2e3444686 100644
--- a/contentcuration/contentcuration/viewsets/file.py
+++ b/contentcuration/contentcuration/viewsets/file.py
@@ -18,7 +18,7 @@
from contentcuration.models import generate_storage_url
from contentcuration.utils.cache import ResourceSizeCache
from contentcuration.utils.sentry import report_exception
-from contentcuration.utils.storage_common import get_presigned_upload_url
+from contentcuration.utils.storage.common import get_presigned_upload_url
from contentcuration.utils.user import calculate_user_storage
from contentcuration.viewsets.base import BulkDeleteMixin
from contentcuration.viewsets.base import BulkListSerializer
@@ -252,9 +252,7 @@ def upload_url(self, request):
checksum_base64 = codecs.encode(
codecs.decode(checksum, "hex"), "base64"
).decode()
- retval = get_presigned_upload_url(
- filepath, checksum_base64, 600, content_length=size
- )
+ retval = get_presigned_upload_url(filepath, checksum_base64, 600)
file = File(
file_size=size,
diff --git a/docker-compose.yml b/docker-compose.yml
index 3a07894c8d..b6d0b0069a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -14,7 +14,8 @@ x-studio-environment:
CELERY_BROKER_ENDPOINT: redis
CELERY_RESULT_BACKEND_ENDPOINT: redis
CELERY_REDIS_PASSWORD: ""
- PROBER_STUDIO_BASE_URL: http://studio-app:8080/{path}
+ PROBER_STUDIO_BASE_URL: http://studio-app:8081/{path}
+ WEBPACK_DEV_HOST: 0.0.0.0
x-studio-worker:
&studio-worker
@@ -36,10 +37,7 @@ services:
build:
context: .
dockerfile: k8s/images/nginx/Dockerfile
- ports:
- - "8081:8080"
- depends_on:
- - studio-app
+ network_mode: host
environment: *studio-environment
studio-app:
@@ -47,7 +45,7 @@ services:
entrypoint: python docker/entrypoint.py
command: pnpm run devserver
ports:
- - "8080:8080"
+ - "8081:8081"
- "4000:4000"
celery-worker:
diff --git a/package.json b/package.json
index 7f5764f22d..bf86ea58c8 100644
--- a/package.json
+++ b/package.json
@@ -25,7 +25,7 @@
"test-jest": "pnpm run test",
"test-jest:debug": "node --inspect node_modules/.bin/jest --runInBand --watch",
"minio": "MINIO_API_CORS_ALLOW_ORIGIN='http://localhost:8080,http://127.0.0.1:8080' MINIO_ACCESS_KEY=development MINIO_SECRET_KEY=development minio server ~/.minio_data/ || true",
- "runserver": "cd contentcuration && python manage.py runserver --settings=contentcuration.dev_settings 0.0.0.0:8080",
+ "runserver": "cd contentcuration && python manage.py runserver --settings=contentcuration.dev_settings 0.0.0.0:8081",
"devserver": "npm-run-all --parallel build:dev runserver",
"devserver:hot": "npm-run-all --parallel build:dev:hot runserver",
"devserver-hot": "pnpm run devserver:hot",
diff --git a/requirements-dev.in b/requirements-dev.in
index bd1d8385e8..02c2458af5 100644
--- a/requirements-dev.in
+++ b/requirements-dev.in
@@ -9,3 +9,4 @@ pre-commit==4.5.0
nodeenv
pip-tools==7.5.2
drf-yasg==1.21.10
+tqdm
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b1442ddbc2..50528b8eb2 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -113,6 +113,8 @@ tomli==1.2.3
# build
# pip-tools
# pytest
+tqdm==4.67.1
+ # via -r requirements-dev.in
typing-extensions==4.15.0
# via
# -c requirements.txt
@@ -123,3 +125,7 @@ virtualenv==20.26.6
# via pre-commit
wheel==0.38.1
# via pip-tools
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
+# setuptools
diff --git a/webpack.config.js b/webpack.config.js
index d055936b79..7da3a2dd61 100644
--- a/webpack.config.js
+++ b/webpack.config.js
@@ -16,7 +16,12 @@ const WebpackRTLPlugin = require('kolibri-tools/lib/webpackRtlPlugin');
const { InjectManifest } = require('workbox-webpack-plugin');
-// Function to detect if running in WSL
+const DEFAULT_WEBPACK_DEV_HOST = '127.0.0.1';
+
+/**
+ * Function to detect if running in WSL
+ * @return {boolean}
+ */
function isWSL() {
try {
const version = fs.readFileSync('/proc/version', 'utf8');
@@ -26,14 +31,24 @@ function isWSL() {
}
}
-// Function to get WSL IP address
-function getWSLIP() {
+/**
+ * Get the host for the webpack dev server.
+ * @return {string}
+ */
+function getWebpackDevHost() {
+ if (process.env.WEBPACK_DEV_HOST) {
+ return process.env.WEBPACK_DEV_HOST;
+ }
+
+ if (!isWSL()) {
+ return DEFAULT_WEBPACK_DEV_HOST;
+ }
+
try {
- const ip = execSync('hostname -I').toString().trim().split(' ')[0];
- return ip;
+ return execSync('hostname -I').toString().trim().split(' ')[0];
} catch (err) {
console.warn('Failed to get WSL IP address:', err);
- return '127.0.0.1';
+ return DEFAULT_WEBPACK_DEV_HOST;
}
}
@@ -60,11 +75,8 @@ module.exports = (env = {}) => {
const pnpmNodeModules = path.join(rootDir, 'node_modules', '.pnpm', 'node_modules');
// Determine the appropriate dev server host and public path based on environment
- const isWSLEnvironment = isWSL();
- const devServerHost = isWSLEnvironment ? '0.0.0.0' : '127.0.0.1';
- const devPublicPath = isWSLEnvironment ?
- `http://${getWSLIP()}:4000/dist/` :
- 'http://127.0.0.1:4000/dist/';
+ const devServerHost = getWebpackDevHost();
+ const devPublicPath = `http://${devServerHost}:4000/dist/`;
const workboxPlugin = new InjectManifest({
swSrc: path.resolve(srcDir, 'serviceWorker/index.js'),
@@ -120,10 +132,8 @@ module.exports = (env = {}) => {
allowedHosts: [
'127.0.0.1',
'localhost',
- ].concat(
- // For WSL, allow the WSL IP address
- isWSLEnvironment ? [getWSLIP()] : []
- ),
+ getWebpackDevHost(),
+ ]
},
module: {
rules: [