diff --git a/docs/decisions/0015-serving-static-assets.rst b/docs/decisions/0015-serving-static-assets.rst index cd8a5845..a3586976 100644 --- a/docs/decisions/0015-serving-static-assets.rst +++ b/docs/decisions/0015-serving-static-assets.rst @@ -18,7 +18,7 @@ Data Storage Implementation The underlying data models live in the openedx-learning repo. The most relevant models are: -* `RawContent in contents/models.py `_ +* `Content in contents/models.py `_ * `Component and ComponentVersion in components/models.py `_ Key takeaways about how this data is stored: diff --git a/olx_importer/management/commands/load_components.py b/olx_importer/management/commands/load_components.py index 575f01ed..49c07b1e 100644 --- a/olx_importer/management/commands/load_components.py +++ b/olx_importer/management/commands/load_components.py @@ -11,7 +11,7 @@ Open Question: If the data model is extensible, how do we know whether a change has really happened between what's currently stored/published for a particular -item and the new value we want to set? For RawContent that's easy, because we +item and the new value we want to set? For Content that's easy, because we have actual hashes of the data. But it's not clear how that would work for something like an ComponentVersion. We'd have to have some kind of mechanism where every pp that wants to attach data gets to answer the question of "has anything @@ -116,20 +116,20 @@ def create_content(self, static_local_path, now, component_version): logger.warning(f' Static reference not found: "{real_path}"') return # Might as well bail if we can't find the file. - raw_content, _created = contents_api.get_or_create_raw_content( + content = contents_api.get_or_create_file_content( self.learning_package.id, - data_bytes=data_bytes, + data=data_bytes, mime_type=mime_type, created=now, ) - components_api.add_content_to_component_version( + components_api.create_component_version_content( component_version, - raw_content_id=raw_content.id, + content.id, key=key, learner_downloadable=True, ) - def import_block_type(self, block_type, now): # , publish_log_entry): + def import_block_type(self, block_type_name, now): # , publish_log_entry): components_found = 0 components_skipped = 0 @@ -138,8 +138,8 @@ def import_block_type(self, block_type, now): # , publish_log_entry): # not fool-proof as it will match static file references that are # outside of tag declarations as well. static_files_regex = re.compile(r"""['"]\/static\/(.+?)["'\?]""") - block_data_path = self.course_data_path / block_type - namespace="xblock.v1" + block_data_path = self.course_data_path / block_type_name + block_type = components_api.get_or_create_component_type("xblock.v1", block_type_name) for xml_file_path in block_data_path.glob("*.xml"): components_found += 1 @@ -157,27 +157,26 @@ def import_block_type(self, block_type, now): # , publish_log_entry): display_name = block_root.attrib.get("display_name", "") _component, component_version = components_api.create_component_and_version( self.learning_package.id, - namespace=namespace, - type_name=block_type, + component_type=block_type, local_key=local_key, title=display_name, created=now, created_by=None, ) - # Create the RawContent entry for the raw data... - data_bytes = xml_file_path.read_bytes() - text_content, _created = contents_api.get_or_create_text_content_from_bytes( + # Create the Content entry for the raw data... + text = xml_file_path.read_text('utf-8') + text_content, _created = contents_api.get_or_create_text_content( self.learning_package.id, - data_bytes=data_bytes, - mime_type=f"application/vnd.openedx.xblock.v1.{block_type}+xml", + text=text, + mime_type=f"application/vnd.openedx.xblock.v1.{block_type_name}+xml", created=now, ) # Add the OLX source text to the ComponentVersion - components_api.add_content_to_component_version( + components_api.create_component_version_content( component_version, - raw_content_id=text_content.pk, - key="source.xml", + text_content.pk, + key="block.xml", learner_downloadable=False ) diff --git a/openedx_learning/__init__.py b/openedx_learning/__init__.py index fa8a6955..916c50b1 100644 --- a/openedx_learning/__init__.py +++ b/openedx_learning/__init__.py @@ -1,4 +1,4 @@ """ Open edX Learning ("Learning Core"). """ -__version__ = "0.5.2" +__version__ = "0.6.0" diff --git a/openedx_learning/contrib/media_server/readme.rst b/openedx_learning/contrib/media_server/readme.rst index aef39566..160dbb70 100644 --- a/openedx_learning/contrib/media_server/readme.rst +++ b/openedx_learning/contrib/media_server/readme.rst @@ -1,7 +1,7 @@ Media Server App ================ -The ``media_server`` app exists to serve media files that are ultimately backed by the RawContent model, *for development purposes and for sites with light-to-moderate traffic*. It also provides an API that can be used by CDNs for high traffic sites. +The ``media_server`` app exists to serve media files that are ultimately backed by the Content model, *for development purposes and for sites with light-to-moderate traffic*. It also provides an API that can be used by CDNs for high traffic sites. Motivation ---------- diff --git a/openedx_learning/contrib/media_server/views.py b/openedx_learning/contrib/media_server/views.py index 752c915c..79a8abdd 100644 --- a/openedx_learning/contrib/media_server/views.py +++ b/openedx_learning/contrib/media_server/views.py @@ -8,7 +8,7 @@ from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from django.http import FileResponse, Http404 -from openedx_learning.core.components.api import get_component_version_content +from openedx_learning.core.components.api import look_up_component_version_content def component_asset( @@ -28,7 +28,7 @@ def component_asset( * Serving from a different domain than the rest of the service """ try: - cvc = get_component_version_content( + cvc = look_up_component_version_content( learning_package_key, component_key, version_num, asset_path ) except ObjectDoesNotExist: diff --git a/openedx_learning/core/components/admin.py b/openedx_learning/core/components/admin.py index 1e84391b..f09483e9 100644 --- a/openedx_learning/core/components/admin.py +++ b/openedx_learning/core/components/admin.py @@ -9,7 +9,7 @@ from openedx_learning.lib.admin_utils import ReadOnlyModelAdmin -from .models import Component, ComponentVersion, ComponentVersionRawContent +from .models import Component, ComponentVersion, ComponentVersionContent class ComponentVersionInline(admin.TabularInline): @@ -48,18 +48,18 @@ class ComponentAdmin(ReadOnlyModelAdmin): inlines = [ComponentVersionInline] -class RawContentInline(admin.TabularInline): +class ContentInline(admin.TabularInline): """ - Django admin configuration for RawContent + Django admin configuration for Content """ - model = ComponentVersion.raw_contents.through + model = ComponentVersion.contents.through def get_queryset(self, request): queryset = super().get_queryset(request) return queryset.select_related( - "raw_content", - "raw_content__learning_package", - "raw_content__text_content", + "content", + "content__learning_package", + "content__media_type", "component_version", "component_version__publishable_entity_version", "component_version__component", @@ -73,7 +73,7 @@ def get_queryset(self, request): "rendered_data", ] readonly_fields = [ - "raw_content", + "content", "format_key", "format_size", "rendered_data", @@ -85,7 +85,7 @@ def rendered_data(self, cvc_obj): @admin.display(description="Size") def format_size(self, cvc_obj): - return filesizeformat(cvc_obj.raw_content.size) + return filesizeformat(cvc_obj.content.size) @admin.display(description="Key") def format_key(self, cvc_obj): @@ -108,7 +108,7 @@ class ComponentVersionAdmin(ReadOnlyModelAdmin): "title", "version_num", "created", - "raw_contents", + "contents", ] fields = [ "component", @@ -118,7 +118,7 @@ class ComponentVersionAdmin(ReadOnlyModelAdmin): "created", ] list_display = ["component", "version_num", "uuid", "created"] - inlines = [RawContentInline] + inlines = [ContentInline] def get_queryset(self, request): queryset = super().get_queryset(request) @@ -129,12 +129,12 @@ def get_queryset(self, request): ) -def link_for_cvc(cvc_obj: ComponentVersionRawContent) -> str: +def link_for_cvc(cvc_obj: ComponentVersionContent) -> str: """ - Get the download URL for the given ComponentVersionRawContent instance + Get the download URL for the given ComponentVersionContent instance """ return "/media_server/component_asset/{}/{}/{}/{}".format( - cvc_obj.raw_content.learning_package.key, + cvc_obj.content.learning_package.key, cvc_obj.component_version.component.key, cvc_obj.component_version.version_num, cvc_obj.key, @@ -151,27 +151,18 @@ def format_text_for_admin_display(text: str) -> SafeText: ) -def content_preview(cvc_obj: ComponentVersionRawContent) -> SafeText: +def content_preview(cvc_obj: ComponentVersionContent) -> SafeText: """ - Get the HTML to display a preview of the given ComponentVersionRawContent + Get the HTML to display a preview of the given ComponentVersionContent """ - raw_content_obj = cvc_obj.raw_content + content_obj = cvc_obj.content - if raw_content_obj.media_type.type == "image": + if content_obj.media_type.type == "image": return format_html( '', - # TODO: configure with settings value: - "/media_server/component_asset/{}/{}/{}/{}".format( - cvc_obj.raw_content.learning_package.key, - cvc_obj.component_version.component.key, - cvc_obj.component_version.version_num, - cvc_obj.key, - ), + content_obj.file_url(), ) - if hasattr(raw_content_obj, "text_content"): - return format_text_for_admin_display( - raw_content_obj.text_content.text, - ) - - return format_html("This content type cannot be displayed.") + return format_text_for_admin_display( + content_obj.text or "" + ) diff --git a/openedx_learning/core/components/api.py b/openedx_learning/core/components/api.py index 2bec7b34..d0c6632f 100644 --- a/openedx_learning/core/components/api.py +++ b/openedx_learning/core/components/api.py @@ -18,28 +18,32 @@ from django.db.models import Q, QuerySet from django.db.transaction import atomic -from ...lib.cache import lru_cache from ..publishing import api as publishing_api -from .models import Component, ComponentType, ComponentVersion, ComponentVersionRawContent +from .models import Component, ComponentType, ComponentVersion, ComponentVersionContent -@lru_cache(maxsize=128) -def get_or_create_component_type_id(namespace: str, name: str) -> int: +def get_or_create_component_type(namespace: str, name: str) -> ComponentType: """ Get the ID of a ComponentType, and create if missing. + + Caching Warning: Be careful about putting any caching decorator around this + function (e.g. ``lru_cache``). It's possible that incorrect cache values + could leak out in the event of a rollback–e.g. new types are introduced in + a large import transaction which later fails. You can safely cache the + results that come back from this function with a local dict in your import + process instead.# """ component_type, _created = ComponentType.objects.get_or_create( namespace=namespace, name=name, ) - return component_type.id + return component_type def create_component( learning_package_id: int, /, - namespace: str, - type_name: str, + component_type: ComponentType, local_key: str, created: datetime, created_by: int | None, @@ -47,7 +51,7 @@ def create_component( """ Create a new Component (an entity like a Problem or Video) """ - key = f"{namespace}:{type_name}@{local_key}" + key = f"{component_type.namespace}:{component_type.name}@{local_key}" with atomic(): publishable_entity = publishing_api.create_publishable_entity( learning_package_id, key, created, created_by @@ -55,7 +59,7 @@ def create_component( component = Component.objects.create( publishable_entity=publishable_entity, learning_package_id=learning_package_id, - component_type_id=get_or_create_component_type_id(namespace, type_name), + component_type=component_type, local_key=local_key, ) return component @@ -101,10 +105,10 @@ def create_next_version( A very common pattern for making a new ComponentVersion is going to be "make it just like the last version, except changing these one or two things". Before calling this, you should create any new contents via the contents - API, since ``content_to_replace`` needs RawContent IDs for the values. + API, since ``content_to_replace`` needs Content IDs for the values. The ``content_to_replace`` dict is a mapping of strings representing the - local path/key for a file, to ``RawContent.id`` values. Using a `None` for + local path/key for a file, to ``Content.id`` values. Using a `None` for a value in this dict means to delete that key in the next version. It is okay to mark entries for deletion that don't exist. For instance, if a @@ -144,25 +148,25 @@ def create_next_version( component_id=component_pk, ) # First copy the new stuff over... - for key, raw_content_pk in content_to_replace.items(): - # If the raw_content_pk is None, it means we want to remove the + for key, content_pk in content_to_replace.items(): + # If the content_pk is None, it means we want to remove the # content represented by our key from the next version. Otherwise, - # we add our key->raw_content_pk mapping to the next version. - if raw_content_pk is not None: - ComponentVersionRawContent.objects.create( - raw_content_id=raw_content_pk, + # we add our key->content_pk mapping to the next version. + if content_pk is not None: + ComponentVersionContent.objects.create( + content_id=content_pk, component_version=component_version, key=key, learner_downloadable=False, ) # Now copy any old associations that existed, as long as they aren't # in conflict with the new stuff or marked for deletion. - last_version_content_mapping = ComponentVersionRawContent.objects \ - .filter(component_version=last_version) + last_version_content_mapping = ComponentVersionContent.objects \ + .filter(component_version=last_version) for cvrc in last_version_content_mapping: if cvrc.key not in content_to_replace: - ComponentVersionRawContent.objects.create( - raw_content_id=cvrc.raw_content_id, + ComponentVersionContent.objects.create( + content_id=cvrc.content_id, component_version=component_version, key=cvrc.key, learner_downloadable=cvrc.learner_downloadable, @@ -174,8 +178,7 @@ def create_next_version( def create_component_and_version( learning_package_id: int, /, - namespace: str, - type_name: str, + component_type: ComponentType, local_key: str, title: str, created: datetime, @@ -186,7 +189,7 @@ def create_component_and_version( """ with atomic(): component = create_component( - learning_package_id, namespace, type_name, local_key, created, created_by + learning_package_id, component_type, local_key, created, created_by ) component_version = create_component_version( component.pk, @@ -282,27 +285,29 @@ def get_components( qset = qset.filter(component_type__name__in=type_names) if draft_title is not None: qset = qset.filter( - publishable_entity__draft__version__title__icontains=draft_title + Q(publishable_entity__draft__version__title__icontains=draft_title) | + Q(local_key__icontains=draft_title) ) if published_title is not None: qset = qset.filter( - publishable_entity__published__version__title__icontains=published_title + Q(publishable_entity__published__version__title__icontains=published_title) | + Q(local_key__icontains=published_title) ) return qset -def get_component_version_content( +def look_up_component_version_content( learning_package_key: str, component_key: str, version_num: int, key: Path, -) -> ComponentVersionRawContent: +) -> ComponentVersionContent: """ - Look up ComponentVersionRawContent by human readable keys. + Look up ComponentVersionContent by human readable keys. Can raise a django.core.exceptions.ObjectDoesNotExist error if there is no - matching ComponentVersionRawContent. + matching ComponentVersionContent. """ queries = ( Q(component_version__component__learning_package__key=learning_package_key) @@ -310,30 +315,29 @@ def get_component_version_content( & Q(component_version__publishable_entity_version__version_num=version_num) & Q(key=key) ) - return ComponentVersionRawContent.objects \ - .select_related( - "raw_content", - "raw_content__media_type", - "raw_content__textcontent", - "component_version", - "component_version__component", - "component_version__component__learning_package", - ).get(queries) - - -def add_content_to_component_version( + return ComponentVersionContent.objects \ + .select_related( + "content", + "content__media_type", + "component_version", + "component_version__component", + "component_version__component__learning_package", + ).get(queries) + + +def create_component_version_content( component_version_id: int, + content_id: int, /, - raw_content_id: int, key: str, learner_downloadable=False, -) -> ComponentVersionRawContent: +) -> ComponentVersionContent: """ - Add a RawContent to the given ComponentVersion + Add a Content to the given ComponentVersion """ - cvrc, _created = ComponentVersionRawContent.objects.get_or_create( + cvrc, _created = ComponentVersionContent.objects.get_or_create( component_version_id=component_version_id, - raw_content_id=raw_content_id, + content_id=content_id, key=key, learner_downloadable=learner_downloadable, ) diff --git a/openedx_learning/core/components/migrations/0001_initial.py b/openedx_learning/core/components/migrations/0001_initial.py index b72d1aee..ece93796 100644 --- a/openedx_learning/core/components/migrations/0001_initial.py +++ b/openedx_learning/core/components/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.23 on 2024-01-31 05:34 +# Generated by Django 3.2.23 on 2024-02-06 03:23 import uuid @@ -13,8 +13,8 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ('oel_publishing', '0001_initial'), ('oel_contents', '0001_initial'), + ('oel_publishing', '0001_initial'), ] operations = [ @@ -49,20 +49,20 @@ class Migration(migrations.Migration): }, ), migrations.CreateModel( - name='ComponentVersionRawContent', + name='ComponentVersionContent', fields=[ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True, verbose_name='UUID')), ('key', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=500)), ('learner_downloadable', models.BooleanField(default=False)), ('component_version', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_components.componentversion')), - ('raw_content', models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='oel_contents.rawcontent')), + ('content', models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='oel_contents.content')), ], ), migrations.AddField( model_name='componentversion', - name='raw_contents', - field=models.ManyToManyField(related_name='component_versions', through='oel_components.ComponentVersionRawContent', to='oel_contents.RawContent'), + name='contents', + field=models.ManyToManyField(related_name='component_versions', through='oel_components.ComponentVersionContent', to='oel_contents.Content'), ), migrations.AddField( model_name='component', @@ -75,16 +75,16 @@ class Migration(migrations.Migration): field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage'), ), migrations.AddIndex( - model_name='componentversionrawcontent', - index=models.Index(fields=['raw_content', 'component_version'], name='oel_cvrawcontent_c_cv'), + model_name='componentversioncontent', + index=models.Index(fields=['content', 'component_version'], name='oel_cvcontent_c_cv'), ), migrations.AddIndex( - model_name='componentversionrawcontent', - index=models.Index(fields=['component_version', 'raw_content'], name='oel_cvrawcontent_cv_d'), + model_name='componentversioncontent', + index=models.Index(fields=['component_version', 'content'], name='oel_cvcontent_cv_d'), ), migrations.AddConstraint( - model_name='componentversionrawcontent', - constraint=models.UniqueConstraint(fields=('component_version', 'key'), name='oel_cvrawcontent_uniq_cv_key'), + model_name='componentversioncontent', + constraint=models.UniqueConstraint(fields=('component_version', 'key'), name='oel_cvcontent_uniq_cv_key'), ), migrations.AddIndex( model_name='component', diff --git a/openedx_learning/core/components/models.py b/openedx_learning/core/components/models.py index 920b3efb..59dc7700 100644 --- a/openedx_learning/core/components/models.py +++ b/openedx_learning/core/components/models.py @@ -1,5 +1,5 @@ """ -The model hierarchy is Component -> ComponentVersion -> RawContent. +The model hierarchy is Component -> ComponentVersion -> Content. A Component is an entity like a Problem or Video. It has enough information to identify the Component and determine what the handler should be (e.g. XBlock @@ -10,20 +10,18 @@ publishing app. Component maps 1:1 to PublishableEntity and ComponentVersion maps 1:1 to PublishableEntityVersion. -Multiple pieces of RawContent may be associated with a ComponentVersion, through -the ComponentVersionRawContent model. ComponentVersionRawContent allows to -specify a ComponentVersion-local identifier. We're using this like a file path -by convention, but it's possible we might want to have special identifiers -later. +Multiple pieces of Content may be associated with a ComponentVersion, through +the ComponentVersionContent model. ComponentVersionContent allows to specify a +ComponentVersion-local identifier. We're using this like a file path by +convention, but it's possible we might want to have special identifiers later. """ from __future__ import annotations from django.db import models -from openedx_learning.lib.fields import case_sensitive_char_field, immutable_uuid_field, key_field -from openedx_learning.lib.managers import WithRelationsManager - -from ..contents.models import RawContent +from ...lib.fields import case_sensitive_char_field, immutable_uuid_field, key_field +from ...lib.managers import WithRelationsManager +from ..contents.models import Content from ..publishing.model_mixins import PublishableEntityMixin, PublishableEntityVersionMixin from ..publishing.models import LearningPackage @@ -40,6 +38,10 @@ class ComponentType(models.Model): type of Components–e.g. marking certain types of XBlocks as approved vs. experimental for use in libraries. """ + # We don't need the app default of 8-bytes for this primary key, but there + # is just a tiny chance that we'll use ComponentType in a novel, user- + # customizable way that will require more than 32K entries. So let's use a + # 4-byte primary key. id = models.AutoField(primary_key=True) # namespace and name work together to help figure out what Component needs @@ -79,7 +81,7 @@ class Component(PublishableEntityMixin): # type: ignore[django-manager-missing] Problem), but little beyond that. A Component will have many ComponentVersions over time, and most metadata is - associated with the ComponentVersion model and the RawContent that + associated with the ComponentVersion model and the Content that ComponentVersions are associated with. A Component belongs to exactly one LearningPackage. @@ -187,8 +189,8 @@ class ComponentVersion(PublishableEntityVersionMixin): """ A particular version of a Component. - This holds the content using a M:M relationship with RawContent via - ComponentVersionRawContent. + This holds the content using a M:M relationship with Content via + ComponentVersionContent. """ # Tell mypy what type our objects manager has. # It's actually PublishableEntityVersionMixinManager, but that has the exact @@ -202,11 +204,11 @@ class ComponentVersion(PublishableEntityVersionMixin): Component, on_delete=models.CASCADE, related_name="versions" ) - # The raw_contents hold the actual interesting data associated with this + # The contents hold the actual interesting data associated with this # ComponentVersion. - raw_contents: models.ManyToManyField[RawContent, ComponentVersionRawContent] = models.ManyToManyField( - RawContent, - through="ComponentVersionRawContent", + contents: models.ManyToManyField[Content, ComponentVersionContent] = models.ManyToManyField( + Content, + through="ComponentVersionContent", related_name="component_versions", ) @@ -215,32 +217,31 @@ class Meta: verbose_name_plural = "Component Versions" -class ComponentVersionRawContent(models.Model): +class ComponentVersionContent(models.Model): """ - Determines the RawContent for a given ComponentVersion. + Determines the Content for a given ComponentVersion. An ComponentVersion may be associated with multiple pieces of binary data. For instance, a Video ComponentVersion might be associated with multiple transcripts in different languages. - When RawContent is associated with an ComponentVersion, it has some local + When Content is associated with an ComponentVersion, it has some local key that is unique within the the context of that ComponentVersion. This allows the ComponentVersion to do things like store an image file and reference it by a "path" key. - RawContent is immutable and sharable across multiple ComponentVersions and - even across LearningPackages. + Content is immutable and sharable across multiple ComponentVersions. """ - raw_content = models.ForeignKey(RawContent, on_delete=models.RESTRICT) component_version = models.ForeignKey(ComponentVersion, on_delete=models.CASCADE) + content = models.ForeignKey(Content, on_delete=models.RESTRICT) uuid = immutable_uuid_field() key = key_field() # Long explanation for the ``learner_downloadable`` field: # - # Is this RawContent downloadable during the learning experience? This is + # Is this Content downloadable during the learning experience? This is # NOT about public vs. private permissions on course assets, as that will be # a policy that can be changed independently of new versions of the content. # For instance, a course team could decide to flip their course assets from @@ -282,16 +283,16 @@ class Meta: # with two different identifiers, that is permitted. models.UniqueConstraint( fields=["component_version", "key"], - name="oel_cvrawcontent_uniq_cv_key", + name="oel_cvcontent_uniq_cv_key", ), ] indexes = [ models.Index( - fields=["raw_content", "component_version"], - name="oel_cvrawcontent_c_cv", + fields=["content", "component_version"], + name="oel_cvcontent_c_cv", ), models.Index( - fields=["component_version", "raw_content"], - name="oel_cvrawcontent_cv_d", + fields=["component_version", "content"], + name="oel_cvcontent_cv_d", ), ] diff --git a/openedx_learning/core/components/readme.rst b/openedx_learning/core/components/readme.rst index 70cf5a5f..b79134bc 100644 --- a/openedx_learning/core/components/readme.rst +++ b/openedx_learning/core/components/readme.rst @@ -18,6 +18,6 @@ Architecture Guidelines * We're keeping nearly unlimited history, so per-version metadata (i.e. the space/time cost of making a new version) must be kept low. * Do not assume that all Components will be XBlocks. -* Encourage other apps to make models that join to (and add their own metadata to) Component, ComponentVersion, RawContent, TextContent etc. But it should be done in such a way that this app is not aware of them. +* Encourage other apps to make models that join to (and add their own metadata to) Component, ComponentVersion, Content, etc. But it should be done in such a way that this app is not aware of them. * Always preserve the most raw version of the data possible, e.g. OLX, even if XBlocks then extend that with more sophisticated data models. At some point those XBlocks will get deprecated/removed, and we will still want to be able to export the raw data. * Exports should be fast and *not* require the invocation of plugin code. \ No newline at end of file diff --git a/openedx_learning/core/contents/admin.py b/openedx_learning/core/contents/admin.py index 8409fe90..036d171b 100644 --- a/openedx_learning/core/contents/admin.py +++ b/openedx_learning/core/contents/admin.py @@ -6,13 +6,13 @@ from openedx_learning.lib.admin_utils import ReadOnlyModelAdmin -from .models import RawContent +from .models import Content -@admin.register(RawContent) -class RawContentAdmin(ReadOnlyModelAdmin): +@admin.register(Content) +class ContentAdmin(ReadOnlyModelAdmin): """ - Django admin for RawContent model + Django admin for Content model """ list_display = [ "hash_digest", @@ -21,6 +21,7 @@ class RawContentAdmin(ReadOnlyModelAdmin): "media_type", "size", "created", + "has_file", ] fields = [ "learning_package", @@ -30,30 +31,22 @@ class RawContentAdmin(ReadOnlyModelAdmin): "created", "file_link", "text_preview", - ] - readonly_fields = [ - "learning_package", - "hash_digest", - "media_type", - "size", - "created", - "file_link", - "text_preview", + "has_file", ] list_filter = ("media_type", "learning_package") search_fields = ("hash_digest",) - def file_link(self, raw_content): + def file_link(self, content: Content): + if not content.has_file: + return "" + return format_html( 'Download', - raw_content.file.url, + content.file_url(), ) - def text_preview(self, raw_content): - if not hasattr(raw_content, "text_content"): - return "(not available)" - + def text_preview(self, content: Content): return format_html( '
\n{}\n
', - raw_content.text_content.text, + content.text, ) diff --git a/openedx_learning/core/contents/api.py b/openedx_learning/core/contents/api.py index 0b27690a..06ec49ac 100644 --- a/openedx_learning/core/contents/api.py +++ b/openedx_learning/core/contents/api.py @@ -6,58 +6,16 @@ """ from __future__ import annotations -import codecs from datetime import datetime from django.core.files.base import ContentFile from django.db.transaction import atomic -from ...lib.cache import lru_cache from ...lib.fields import create_hash_digest -from .models import MediaType, RawContent, TextContent +from .models import Content, MediaType -def create_raw_content( - learning_package_id: int, - /, - data_bytes: bytes, - mime_type: str, - created: datetime, - hash_digest: str | None = None, -) -> RawContent: - """ - Create a new RawContent instance and persist it to storage. - """ - hash_digest = hash_digest or create_hash_digest(data_bytes) - - raw_content = RawContent.objects.create( - learning_package_id=learning_package_id, - media_type_id=get_or_create_media_type_id(mime_type), - hash_digest=hash_digest, - size=len(data_bytes), - created=created, - ) - raw_content.file.save( - f"{raw_content.learning_package.uuid}/{hash_digest}", - ContentFile(data_bytes), - ) - return raw_content - - -def create_text_from_raw_content(raw_content: RawContent, encoding="utf-8-sig") -> TextContent: - """ - Create a new TextContent instance for the given RawContent. - """ - text = codecs.decode(raw_content.file.open().read(), encoding) - return TextContent.objects.create( - raw_content=raw_content, - text=text, - length=len(text), - ) - - -@lru_cache(maxsize=128) -def get_or_create_media_type_id(mime_type: str) -> int: +def get_or_create_media_type(mime_type: str) -> MediaType: """ Return the MediaType.id for the desired mime_type string. @@ -68,10 +26,12 @@ def get_or_create_media_type_id(mime_type: str) -> int: the different XBlocks that will be installed in different server instances, each of which will use their own MediaType. - This will typically only be called when create_raw_content is calling it to - lookup the media_type_id it should use for a new RawContent. If you already - have a RawContent instance, it makes much more sense to access its - media_type relation. + Caching Warning: Be careful about putting any caching decorator around this + function (e.g. ``lru_cache``). It's possible that incorrect cache values + could leak out in the event of a rollback–e.g. new types are introduced in + a large import transaction which later fails. You can safely cache the + results that come back from this function with a local dict in your import + process instead. """ if "+" in mime_type: base, suffix = mime_type.split("+") @@ -80,69 +40,119 @@ def get_or_create_media_type_id(mime_type: str) -> int: suffix = "" main_type, sub_type = base.split("/") - mt, _created = MediaType.objects.get_or_create( + media_type, _created = MediaType.objects.get_or_create( type=main_type, sub_type=sub_type, suffix=suffix, ) - return mt.id + return media_type -def get_or_create_raw_content( +def get_content(content_id: int, /) -> Content: + """ + Get a single Content object by its ID. + + Content is always attached to something when it's created, like to a + ComponentVersion. That means the "right" way to access a Content is almost + always going to be via those relations and not via this function. But I + include this function anyway because it's tiny to write and it's better than + someone using a get_or_create_* function when they really just want to get. + """ + return Content.objects.get(id=content_id) + + +def get_or_create_text_content( learning_package_id: int, + media_type_id: int, /, - data_bytes: bytes, - mime_type: str, + text: str, created: datetime, - hash_digest: str | None = None, -) -> tuple[RawContent, bool]: + create_file: bool = False, +) -> Content: """ - Get the RawContent in the given learning package with the specified data, - or create it if it doesn't exist. + Get or create a Content entry with text data stored in the database. + + Use this when you want to create relatively small chunks of text that need + to be accessed quickly, especially if you're pulling back multiple rows at + once. For example, this is the function to call when storing OLX for a + component XBlock like a ProblemBlock. + + This function will *always* create a text entry in the database. In addition + to this, if you specify ``create_file=True``, it will also save a copy of + that text data to the file storage backend. This is useful if we want to let + that file be downloadable by browsers in the LMS at some point. + + If you want to create a large text file, or want to create a text file that + doesn't need to be stored in the database, call ``create_file_content`` + instead of this function. """ - hash_digest = hash_digest or create_hash_digest(data_bytes) - try: - raw_content = RawContent.objects.get( - learning_package_id=learning_package_id, hash_digest=hash_digest - ) - was_created = False - except RawContent.DoesNotExist: - raw_content = create_raw_content( - learning_package_id, data_bytes, mime_type, created, hash_digest - ) - was_created = True - - return raw_content, was_created - - -def get_or_create_text_content_from_bytes( + text_as_bytes = text.encode('utf-8') + hash_digest = create_hash_digest(text_as_bytes) + + with atomic(): + try: + content = Content.objects.get( + learning_package_id=learning_package_id, + media_type_id=media_type_id, + hash_digest=hash_digest, + ) + except Content.DoesNotExist: + content = Content( + learning_package_id=learning_package_id, + media_type_id=media_type_id, + hash_digest=hash_digest, + created=created, + size=len(text_as_bytes), + text=text, + has_file=create_file, + ) + content.full_clean() + content.save() + + if create_file: + content.write_file(ContentFile(text_as_bytes)) + + return content + + +def get_or_create_file_content( learning_package_id: int, + media_type_id: int, /, - data_bytes: bytes, - mime_type: str, + data: bytes, created: datetime, - hash_digest: str | None = None, - encoding: str = "utf-8-sig", -): +) -> Content: """ - Get the TextContent in the given learning package with the specified data, - or create it if it doesn't exist. + Get or create a Content with data stored in a file storage backend. + + Use this function to store non-text data, large data, or data where low + latency access is not necessary. Also use this function (or + ``get_or_create_text_content`` with ``create_file=True``) to store any + Content that you want to be downloadable by browsers in the LMS, since the + static asset serving system will only work with file-backed Content. """ + hash_digest = create_hash_digest(data) with atomic(): - raw_content, rc_created = get_or_create_raw_content( - learning_package_id, data_bytes, mime_type, created, hash_digest - ) - if rc_created or not hasattr(raw_content, "text_content"): - text = codecs.decode(data_bytes, encoding) - text_content = TextContent.objects.create( - raw_content=raw_content, - text=text, - length=len(text), + try: + content = Content.objects.get( + learning_package_id=learning_package_id, + media_type_id=media_type_id, + hash_digest=hash_digest, ) - tc_created = True - else: - text_content = raw_content.text_content - tc_created = False + except Content.DoesNotExist: + content = Content( + learning_package_id=learning_package_id, + media_type_id=media_type_id, + hash_digest=hash_digest, + created=created, + size=len(data), + text=None, + has_file=True, + ) + content.full_clean() + content.save() + + content.write_file(ContentFile(data)) - return (text_content, tc_created) + return content diff --git a/openedx_learning/core/contents/migrations/0001_initial.py b/openedx_learning/core/contents/migrations/0001_initial.py index d1099466..1abf65ad 100644 --- a/openedx_learning/core/contents/migrations/0001_initial.py +++ b/openedx_learning/core/contents/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.23 on 2024-01-22 00:38 +# Generated by Django 3.2.23 on 2024-02-06 03:23 import django.core.validators import django.db.models.deletion @@ -18,56 +18,49 @@ class Migration(migrations.Migration): operations = [ migrations.CreateModel( - name='MediaType', - fields=[ - ('id', models.AutoField(primary_key=True, serialize=False)), - ('type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), - ('sub_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), - ('suffix', openedx_learning.lib.fields.MultiCollationCharField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), - ], - ), - migrations.CreateModel( - name='RawContent', + name='Content', fields=[ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('hash_digest', models.CharField(editable=False, max_length=40)), ('size', models.PositiveBigIntegerField(validators=[django.core.validators.MaxValueValidator(50000000)])), + ('hash_digest', models.CharField(editable=False, max_length=40)), + ('has_file', models.BooleanField()), + ('text', openedx_learning.lib.fields.MultiCollationTextField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=50000, null=True)), ('created', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])), - ('file', models.FileField(null=True, upload_to='')), - ('learning_package', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage')), - ('media_type', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='oel_contents.mediatype')), ], options={ - 'verbose_name': 'Raw Content', - 'verbose_name_plural': 'Raw Contents', + 'verbose_name': 'Content', + 'verbose_name_plural': 'Contents', }, ), migrations.CreateModel( - name='TextContent', + name='MediaType', fields=[ - ('raw_content', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, primary_key=True, related_name='text_content', serialize=False, to='oel_contents.rawcontent')), - ('text', openedx_learning.lib.fields.MultiCollationTextField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=100000)), - ('length', models.PositiveIntegerField()), + ('id', models.AutoField(primary_key=True, serialize=False)), + ('type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), + ('sub_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), + ('suffix', openedx_learning.lib.fields.MultiCollationCharField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), ], ), migrations.AddConstraint( model_name='mediatype', constraint=models.UniqueConstraint(fields=('type', 'sub_type', 'suffix'), name='oel_contents_uniq_t_st_sfx'), ), - migrations.AddIndex( - model_name='rawcontent', - index=models.Index(fields=['learning_package', 'media_type'], name='oel_content_idx_lp_media_type'), + migrations.AddField( + model_name='content', + name='learning_package', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage'), ), - migrations.AddIndex( - model_name='rawcontent', - index=models.Index(fields=['learning_package', '-size'], name='oel_content_idx_lp_rsize'), + migrations.AddField( + model_name='content', + name='media_type', + field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='oel_contents.mediatype'), ), migrations.AddIndex( - model_name='rawcontent', - index=models.Index(fields=['learning_package', '-created'], name='oel_content_idx_lp_rcreated'), + model_name='content', + index=models.Index(fields=['learning_package', '-size'], name='oel_content_idx_lp_rsize'), ), migrations.AddConstraint( - model_name='rawcontent', + model_name='content', constraint=models.UniqueConstraint(fields=('learning_package', 'media_type', 'hash_digest'), name='oel_content_uniq_lc_media_type_hash_digest'), ), ] diff --git a/openedx_learning/core/contents/models.py b/openedx_learning/core/contents/models.py index d90c2abd..58565c07 100644 --- a/openedx_learning/core/contents/models.py +++ b/openedx_learning/core/contents/models.py @@ -3,42 +3,53 @@ the simplest building blocks to store data with. They need to be composed into more intelligent data models to be useful. """ +from __future__ import annotations + from functools import cached_property -from django.conf import settings -from django.core.files.storage import default_storage +from django.core.exceptions import ValidationError +from django.core.files.base import File +from django.core.files.storage import Storage, default_storage from django.core.validators import MaxValueValidator from django.db import models -from openedx_learning.lib.fields import ( - MultiCollationTextField, - case_insensitive_char_field, - hash_field, - manual_date_time_field, -) - +from ...lib.fields import MultiCollationTextField, case_insensitive_char_field, hash_field, manual_date_time_field +from ...lib.managers import WithRelationsManager from ..publishing.models import LearningPackage +def get_storage() -> Storage: + """ + Return the Storage instance for our Content file persistence. + + For right now, we're still only storing inline text and not static assets in + production, so just return the default_storage. We're also going through a + transition between Django 3.2 -> 4.2, where storage configuration has moved. + + Make this work properly as part of adding support for static assets. + """ + return default_storage + + class MediaType(models.Model): """ - Stores Media types for use by RawContent models. + Stores Media types for use by Content models. This is the same as MIME types (the IANA renamed MIME Types to Media Types). - We don't pre-populate this table, so APIs that add RawContent must ensure - that the desired Media Type exists. + We don't pre-populate this table, so APIs that add Content must ensure that + the desired Media Type exists. Media types are written as {type}/{sub_type}+{suffix}, where suffixes are - seldom used. + seldom used. Examples: * application/json * text/css * image/svg+xml * application/vnd.openedx.xblock.v1.problem+xml - We have this as a separate model (instead of a field on RawContent) because: + We have this as a separate model (instead of a field on Content) because: - 1. We can save a lot on storage and indexing for RawContent if we're just + 1. We can save a lot on storage and indexing for Content if we're just storing foreign key references there, rather than the entire content string to be indexed. This is especially relevant for our (long) custom types like "application/vnd.openedx.xblock.v1.problem+xml". @@ -46,9 +57,9 @@ class MediaType(models.Model): "application/javascript". Also, we will be using a fair number of "vnd." style of custom content types, and we may want the flexibility of changing that without having to worry about migrating millions of rows of - RawContent. + Content. """ - # We're going to have many foreign key references from RawContent into this + # We're going to have many foreign key references from Content into this # model, and we don't need to store those as 8-byte BigAutoField, as is the # default for this app. It's likely that a SmallAutoField would work, but I # can just barely imagine using more than 32K Media types if we have a bunch @@ -69,10 +80,9 @@ class MediaType(models.Model): # always written in lowercase. sub_type = case_insensitive_char_field(max_length=127, blank=False, null=False) - # Suffix, usually just "xml" (e.g. "image/svg+xml"). Usually blank. I - # couldn't find an RFC description of the length limit, and 127 is probably - # excessive. But this table should be small enough where it doesn't really - # matter. + # Suffix, like "xml" (e.g. "image/svg+xml"). Usually blank. I couldn't find + # an RFC description of the length limit, and 127 is probably excessive. But + # this table should be small enough where it doesn't really matter. suffix = case_insensitive_char_field(max_length=127, blank=True, null=False) class Meta: @@ -95,92 +105,222 @@ def __str__(self): return base -class RawContent(models.Model): # type: ignore[django-manager-missing] +class Content(models.Model): """ - This is the most basic piece of raw content data, with no version metadata. - - RawContent stores data using the "file" field. This data is not - auto-normalized in any way, meaning that pieces of content that are - semantically equivalent (e.g. differently spaced/sorted JSON) may result in - new entries. This model is intentionally ignorant of what these things mean, - because it expects supplemental data models to build on top of it. - - Two RawContent instances _can_ have the same hash_digest if they are of - different MIME types. For instance, an empty text file and an empty SRT file - will both hash the same way, but be considered different entities. - - The other fields on RawContent are for data that is intrinsic to the file - data itself (e.g. the size). Any smart parsing of the contents into more - structured metadata should happen in other models that hang off of - RawContent. - - RawContent models are not versioned in any way. The concept of versioning - only exists at a higher level. - - RawContent is optimized for cheap storage, not low latency. It stores - content in a FileField. If you need faster text access across multiple rows, - add a TextContent entry that corresponds to the relevant RawContent. - - If you need to transform this RawContent into more structured data for your - application, create a model with a OneToOneField(primary_key=True) - relationship to RawContent. Just remember that *you should always create the - RawContent entry* first, to ensure content is always exportable, even if - your app goes away in the future. - - Operational Notes - ----------------- - - RawContent stores data using a FileField, which you'd typically want to back - with something like S3 when running in a production environment. That file - storage backend will not support rollback, meaning that if you start the - import process and things break halfway through, the RawContent model rows - will be rolled back, but the uploaded files will still remain on your file - storage system. The files are based on a hash of the contents though, so it - should still work later on when the import succeeds (it'll just have to - upload fewer files). - - TODO: Write about cleaning up accidental uploads of really large/unnecessary - files. Pruning of unreferenced (never published, or currently unused) - component versions and assets, and how that ties in? + This is the most primitive piece of content data. + + This model serves to lookup, de-duplicate, and store text and files. A piece + of Content is identified purely by its data, the media type, and the + LearningPackage it is associated with. It has no version or file name + metadata associated with it. It exists to be a dumb blob of data that higher + level models like ComponentVersions can assemble together. + + # In-model Text vs. File + + That being said, the Content model does have some complexity to accomodate + different access patterns that we have in our app. In particular, it can + store data in two ways: the ``text`` field and a file (``has_file=True``) + A Content object must use at least one of these methods, but can use both if + it's appropriate. + + Use the ``text`` field when: + * the content is a relatively small (< 50K, usually much less) piece of text + * you want to do be able to query up update across many rows at once + * low, predictable latency is important + + Use file storage when: + * the content is large, or not text-based + * you want to be able to serve the file content directly to the browser + + The high level tradeoff is that ``text`` will give you faster access, and + file storage will give you a much more affordable and scalable backend. The + backend used for files will also eventually allow direct browser download + access, whereas the ``text`` field will not. But again, you can use both at + the same time if needed. + + # Association with a LearningPackage + + Content is associated with a specific LearningPackage. Doing so allows us to + more easily query for how much storge space a specific LearningPackage + (likely a library) is using, and to clean up unused data. + + When we get to borrowing Content across LearningPackages, it's likely that + we will want to copy them. That way, even if the originating LearningPackage + is deleted, it won't break other LearningPackages that are making use if it. + + # Media Types, and file duplication + + Content is almost 1:1 with the files that it pushes to a storage backend, + but not quite. The file locations are generated purely as a product of the + LearningPackage UUID and the Content's ``hash_digest``, but Content also + takes into account the ``media_type``. + + For example, say we had a Content with the following data: + + ["hello", "world"] + + That is legal syntax for both JSON and YAML. If you want to attach some + YAML-specific metadata in a new model, you could make it 1:1 with the + Content that matched the "application/yaml" media type. The YAML and JSON + versions of this data would be two separate Content rows that would share + the same ``hash_digest`` value. If they both stored a file, they would be + pointing to the same file location. If they only used the ``text`` field, + then that value would be duplicated across the two separate Content rows. + + The alternative would have been to associate media types at the level where + this data was being added to a ComponentVersion, but that would have added + more complexity. Right now, you could make an ImageContent 1:1 model that + analyzed images and created metatdata entries for them (dimensions, GPS) + without having to understand how ComponentVerisons work. + + This is definitely an edge case, and it's likely the only time collisions + like this will happen in practice is with blank files. It also means that + using this table to measure disk usage may be slightly inaccurate when used + in a LearningPackage with collisions–though we expect to use numbers like + that mostly to get a broad sense of usage and look for major outliers, + rather than for byte-level accuracy (it wouldn't account for the non-trivial + indexing storage costs either). + + # Immutability + + From the outside, Content should appear immutable. Since the Content is + looked up by a hash of its data, a change in the data means that we should + look up the hash value of that new data and create a new Content if we don't + find a match. + + That being said, the Content model has different ways of storing that data, + and that is mutable. We could decide that a certain type of Content should + be optimized to store its text in the table. Or that a content type that we + had previously only stored as text now also needs to be stored on in the + file storage backend so that it can be made available to be downloaded. + These operations would be done as data migrations. + + # Extensibility + + Third-party apps are encouraged to create models that have a OneToOneField + relationship with Content. For instance, an ImageContent model might join + 1:1 with all Content that has image/* media types, and provide additional + metadata for that data. """ - - # 50 MB is our current limit, based on the current Open edX Studio file - # upload size limit. + # Max size of the file. MAX_FILE_SIZE = 50_000_000 - learning_package = models.ForeignKey(LearningPackage, on_delete=models.CASCADE) + # 50K is our limit for text data, like OLX. This means 50K *characters*, + # not bytes. Since UTF-8 encodes characters using as many as 4 bytes, this + # could be as much as 200K of data if we had nothing but emojis. + MAX_TEXT_LENGTH = 50_000 - # This hash value may be calculated using create_hash_digest from the - # openedx.lib.fields module. - hash_digest = hash_field() + objects: models.Manager[Content] = WithRelationsManager('media_type') + + learning_package = models.ForeignKey(LearningPackage, on_delete=models.CASCADE) # What is the Media type (a.k.a. MIME type) of this data? media_type = models.ForeignKey(MediaType, on_delete=models.PROTECT) - # This is the size of the raw data file in bytes. This can be different than - # the character length, since UTF-8 encoding can use anywhere between 1-4 - # bytes to represent any given character. + # This is the size of the file in bytes. This can be different than the + # character length of a text file, since UTF-8 encoding can use anywhere + # between 1-4 bytes to represent any given character. size = models.PositiveBigIntegerField( validators=[MaxValueValidator(MAX_FILE_SIZE)], ) - # This should be manually set so that multiple RawContent rows being set in - # the same transaction are created with the same timestamp. The timestamp - # should be UTC. - created = manual_date_time_field() + # This hash value may be calculated using create_hash_digest from the + # openedx.lib.fields module. When storing text, we hash the UTF-8 + # encoding of that text value, regardless of whether we also write it to a + # file or not. When storing just a file, we hash the bytes in the file. + hash_digest = hash_field() - # All content for the LearningPackage should be stored in files. See model - # docstring for more details on how to store this data in supplementary data - # models that offer better latency guarantees. - file = models.FileField( + # Do we have file data stored for this Content in our file storage backend? + has_file = models.BooleanField() + + # The ``text`` field contains the text representation of the Content, if + # it is available. A blank value means means that we are storing text for + # this Content, and that text happens to be an empty string. A null value + # here means that we are not storing any text here, and the Content exists + # only in file form. It is an error for ``text`` to be None and ``has_file`` + # to be False, since that would mean we haven't stored data anywhere at all. + # + # We annotate this because mypy doesn't recognize that ``text`` should be + # nullable when using MultiCollationTextField, but does the right thing for + # TextField. For more info, see: + # https://github.com/openedx/openedx-learning/issues/152 + text: models.TextField[str | None, str | None] = MultiCollationTextField( + blank=True, null=True, - storage=settings.OPENEDX_LEARNING.get("STORAGE", default_storage), # type: ignore + max_length=MAX_TEXT_LENGTH, + # We don't really expect to ever sort by the text column, but we may + # want to do case-insensitive searches, so it's useful to have a case + # and accent insensitive collation. + db_collations={ + "sqlite": "NOCASE", + "mysql": "utf8mb4_unicode_ci", + } ) + # This should be manually set so that multiple Content rows being set in + # the same transaction are created with the same timestamp. The timestamp + # should be UTC. + created = manual_date_time_field() + @cached_property - def mime_type(self): + def mime_type(self) -> str: + """ + The IANA media type (a.k.a. MIME type) of the Content, in string form. + + MIME types reference: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types + """ return str(self.media_type) + def file_path(self): + """ + Path at which this content is stored (or would be stored). + + This path is relative to configured storage root. + """ + return f"{self.learning_package.uuid}/{self.hash_digest}" + + def write_file(self, file: File) -> None: + """ + Write file contents to the file storage backend. + """ + storage = get_storage() + file_path = self.file_path() + + # There are two reasons why a file might already exist even if the the + # Content row is new: + # + # 1. We tried adding the file earlier, but an error rolled back the + # state of the database. The file storage system isn't covered by any + # sort of transaction semantics, so it won't get rolled back. + # + # 2. The Content is of a different MediaType. The same exact bytes can + # be two logically separate Content entries if they are different file + # types. This lets other models add data to Content via 1:1 relations by + # ContentType (e.g. all SRT files). This is definitely an edge case. + if not storage.exists(file_path): + storage.save(file_path, file) + + def file_url(self) -> str: + """ + This will sometimes be a time-limited signed URL. + """ + return get_storage().url(self.file_path()) + + def clean(self): + """ + Make sure we're actually storing *something*. + + If this Content has neither a file or text data associated with it, + it's in a broken/useless state and shouldn't be saved. + """ + if (not self.has_file) and (self.text is None): + raise ValidationError( + f"Content {self.pk} with hash {self.hash_digest} must either " + "set a string value for 'text', or it must set has_file=True " + "(or both)." + ) + class Meta: constraints = [ # Make sure we don't store duplicates of this raw data within the @@ -195,71 +335,12 @@ class Meta: ), ] indexes = [ - # LearningPackage Media type Index: - # * Break down Content counts by type/subtype with in a - # LearningPackage. - # * Find all the Content in a LearningPackage that matches a - # certain MIME type (e.g. "image/png", "application/pdf". - models.Index( - fields=["learning_package", "media_type"], - name="oel_content_idx_lp_media_type", - ), # LearningPackage (reverse) Size Index: - # * Find largest Content in a LearningPackage. - # * Find the sum of Content size for a given LearningPackage. + # * Find the largest Content entries. models.Index( fields=["learning_package", "-size"], name="oel_content_idx_lp_rsize", ), - # LearningPackage (reverse) Created Index: - # * Find most recently added Content. - models.Index( - fields=["learning_package", "-created"], - name="oel_content_idx_lp_rcreated", - ), ] - verbose_name = "Raw Content" - verbose_name_plural = "Raw Contents" - - -class TextContent(models.Model): - """ - TextContent supplements RawContent to give an in-table text copy. - - This model exists so that we can have lower-latency access to this data, - particularly if we're pulling back multiple rows at once. - - Apps are encouraged to create their own data models that further extend this - one with a more intelligent, parsed data model. For example, individual - XBlocks might parse the OLX in this model into separate data models for - VideoBlock, ProblemBlock, etc. You can do this by making your supplementary - model linked to this model via OneToOneField with primary_key=True. - - The reason this is built directly into the Learning Core data model is - because we want to be able to easily access and browse this data even if the - app-extended models get deleted (e.g. if they are deprecated and removed). - """ - - # 100K is our limit for text data, like OLX. This means 100K *characters*, - # not bytes. Since UTF-8 encodes characters using as many as 4 bytes, this - # could be as much as 400K of data if we had nothing but emojis. - MAX_TEXT_LENGTH = 100_000 - - raw_content = models.OneToOneField( - RawContent, - on_delete=models.CASCADE, - primary_key=True, - related_name="text_content", - ) - text = MultiCollationTextField( - blank=True, - max_length=MAX_TEXT_LENGTH, - # We don't really expect to ever sort by the text column, but we may - # want to do case-insensitive searches, so it's useful to have a case - # and accent insensitive collation. - db_collations={ - "sqlite": "NOCASE", - "mysql": "utf8mb4_unicode_ci", - } - ) - length = models.PositiveIntegerField(null=False) + verbose_name = "Content" + verbose_name_plural = "Contents" diff --git a/openedx_learning/core/publishing/migrations/0001_initial.py b/openedx_learning/core/publishing/migrations/0001_initial.py index 0a4808f4..6ffb8b85 100644 --- a/openedx_learning/core/publishing/migrations/0001_initial.py +++ b/openedx_learning/core/publishing/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.23 on 2024-01-22 00:37 +# Generated by Django 3.2.23 on 2024-02-06 00:36 import uuid @@ -23,7 +23,7 @@ class Migration(migrations.Migration): migrations.CreateModel( name='LearningPackage', fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('id', models.AutoField(primary_key=True, serialize=False)), ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True, verbose_name='UUID')), ('key', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_bin', 'sqlite': 'BINARY'}, max_length=500)), ('title', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=500)), diff --git a/openedx_learning/core/publishing/models.py b/openedx_learning/core/publishing/models.py index 9082e8f2..3f668a9f 100644 --- a/openedx_learning/core/publishing/models.py +++ b/openedx_learning/core/publishing/models.py @@ -30,9 +30,21 @@ class LearningPackage(models.Model): # type: ignore[django-manager-missing] Each PublishableEntity belongs to exactly one LearningPackage. """ + # Explictly declare a 4-byte ID instead of using the app-default 8-byte ID. + # We do not expect to have more than 2 billion LearningPackages on a given + # site. Furthermore, many, many things have foreign keys to this model and + # uniqueness indexes on those foreign keys + their own fields, so the 4 + # bytes saved will add up over time. + id = models.AutoField(primary_key=True) + uuid = immutable_uuid_field() key = key_field() title = case_insensitive_char_field(max_length=500, blank=False) + + # TODO: We should probably defer this field, since many things pull back + # LearningPackage as select_related. Usually those relations only care about + # the UUID and key, so maybe it makes sense to separate the model at some + # point. description = MultiCollationTextField( blank=True, null=False, @@ -355,6 +367,21 @@ class PublishLog(models.Model): Open question: Empty publishes are allowed at this time, and might be useful for "fake" publishes that are necessary to invoke other post-publish actions. It's not clear at this point how useful this will actually be. + + The absence of a ``version_num`` field in this model is intentional, because + having one would potentially cause write contention/locking issues when + there are many people working on different entities in a very large library. + We already see some contention issues occuring in ModuleStore for courses, + and we want to support Libraries that are far larger. + + If you need a LearningPackage-wide indicator for version and the only thing + you care about is "has *something* changed?", you can make a foreign key to + the most recent PublishLog, or use the most recent PublishLog's primary key. + This should be monotonically increasing, though there will be large gaps in + values, e.g. (5, 190, 1291, etc.). Be warned that this value will not port + across sites. If you need site-portability, the UUIDs for this model are a + safer bet, though there's a lot about import/export that we haven't fully + mapped out yet. """ uuid = immutable_uuid_field() diff --git a/openedx_learning/lib/fields.py b/openedx_learning/lib/fields.py index 900e590f..95e41bef 100644 --- a/openedx_learning/lib/fields.py +++ b/openedx_learning/lib/fields.py @@ -20,6 +20,18 @@ def create_hash_digest(data_bytes: bytes) -> str: + """ + Create a 40-byte, lower-case hex string representation of a hash digest. + + The hash digest itself is 20-bytes using BLAKE2b. + + DON'T JUST MODIFY THIS HASH BEHAVIOR!!! We use hashing for de-duplication + purposes. If this hash function ever changes, that deduplication will fail + because the hashing behavior won't match what's already in the database. + + If we want to change this representation one day, we should create a new + function for that and do the appropriate data migration. + """ return hashlib.blake2b(data_bytes, digest_size=20).hexdigest() diff --git a/openedx_learning/lib/managers.py b/openedx_learning/lib/managers.py index ed700072..b21f0a6b 100644 --- a/openedx_learning/lib/managers.py +++ b/openedx_learning/lib/managers.py @@ -13,8 +13,12 @@ class WithRelationsManager(models.Manager): into some of its relations and you want to avoid unnecessary extra database calls. - Use this to create a distinctly named manager on your model class, instead - of overwriting ``objects``. So for example:: + You can override the default ``objects`` manager with this one if you have + a model that should basically always called with a ``select_related``. For + example, if you have a small lookup type-model that is frequently accessed. + + For more complex joins, use this class to create a distinctly named manager + on your model class, instead of overwriting ``objects``. So for example:: class Component(models.Model): with_publishing_relations = WithRelationsManager( diff --git a/test_settings.py b/test_settings.py index 53548fb4..67e87fc9 100644 --- a/test_settings.py +++ b/test_settings.py @@ -60,13 +60,6 @@ def root(*args): USE_TZ = True -# openedx-learning required configuration -OPENEDX_LEARNING = { - # Custom file storage, though this is better done through Django's - # STORAGES setting in Django >= 4.2 - "STORAGE": None, -} - MEDIA_ROOT = root("test_media") ######################### Django Rest Framework ######################## diff --git a/tests/openedx_learning/core/components/test_api.py b/tests/openedx_learning/core/components/test_api.py index b1d46929..cd5543ed 100644 --- a/tests/openedx_learning/core/components/test_api.py +++ b/tests/openedx_learning/core/components/test_api.py @@ -6,38 +6,48 @@ from django.core.exceptions import ObjectDoesNotExist from openedx_learning.core.components import api as components_api -from openedx_learning.core.components.models import Component +from openedx_learning.core.components.models import Component, ComponentType from openedx_learning.core.contents import api as contents_api +from openedx_learning.core.contents.models import MediaType from openedx_learning.core.publishing import api as publishing_api from openedx_learning.core.publishing.models import LearningPackage from openedx_learning.lib.test_utils import TestCase -class PerformanceTestCase(TestCase): +class ComponentTestCase(TestCase): """ - Performance related tests for Components. - - These are mostly to ensure that when Components are fetched, they're fetched - with a select_related on the most commonly queried things; draft and - published version metadata. + Base-class for setting up commonly used test data. """ learning_package: LearningPackage now: datetime + # XBlock Component Types + html_type: ComponentType + problem_type: ComponentType + video_type: ComponentType + @classmethod def setUpTestData(cls) -> None: - """ - Initialize our base learning package. - - We don't actually need to add content to the ComponentVersions, since - for this we only care about the metadata on Compnents, their versions, - and the associated draft/publish status. - """ cls.learning_package = publishing_api.create_learning_package( - "components.TestPerformance", - "Learning Package for Testing Performance (measured by DB queries)", + key="ComponentTestCase-test-key", + title="Components Test Case Learning Package", ) cls.now = datetime(2023, 5, 8, tzinfo=timezone.utc) + cls.html_type = components_api.get_or_create_component_type("xblock.v1", "html") + cls.problem_type = components_api.get_or_create_component_type("xblock.v1", "problem") + cls.video_type = components_api.get_or_create_component_type("xblock.v1", "video") + + +class PerformanceTestCase(ComponentTestCase): + """ + Performance related tests for Components. + + These are mostly to ensure that when Components are fetched, they're fetched + with a select_related on the most commonly queried things; draft and + published version metadata. + """ + learning_package: LearningPackage + now: datetime def test_component_num_queries(self) -> None: """ @@ -45,8 +55,7 @@ def test_component_num_queries(self) -> None: """ component, _version = components_api.create_component_and_version( self.learning_package.id, - namespace="xblock.v1", - type_name="problem", + component_type=self.problem_type, local_key="Query Counting", title="Querying Counting Problem", created=self.now, @@ -66,12 +75,10 @@ def test_component_num_queries(self) -> None: assert draft.title == published.title -class GetComponentsTestCase(TestCase): +class GetComponentsTestCase(ComponentTestCase): """ Test grabbing a queryset of Components. """ - learning_package: LearningPackage - now: datetime published_problem: Component published_html: Component unpublished_problem: Component @@ -87,27 +94,21 @@ def setUpTestData(cls) -> None: for this we only care about the metadata on Compnents, their versions, and the associated draft/publish status. """ - cls.learning_package = publishing_api.create_learning_package( - "components.TestGetComponents", - "Learning Package for Testing Getting & Filtering Components", - ) - cls.now = datetime(2023, 5, 8, tzinfo=timezone.utc) + super().setUpTestData() + v2_problem_type = components_api.get_or_create_component_type("xblock.v2", "problem") - # Components we're publishing... cls.published_problem, _version = components_api.create_component_and_version( cls.learning_package.id, - namespace="xblock.v2", - type_name="problem", - local_key="published_problem", + component_type=v2_problem_type, + local_key="pp_lk", title="Published Problem", created=cls.now, created_by=None, ) cls.published_html, _version = components_api.create_component_and_version( cls.learning_package.id, - namespace="xblock.v1", - type_name="html", - local_key="published_html", + component_type=cls.html_type, + local_key="ph_lk", title="Published HTML", created=cls.now, created_by=None, @@ -120,18 +121,16 @@ def setUpTestData(cls) -> None: # Components that exist only as Drafts cls.unpublished_problem, _version = components_api.create_component_and_version( cls.learning_package.id, - namespace="xblock.v2", - type_name="problem", - local_key="unpublished_problem", + component_type=v2_problem_type, + local_key="upp_lk", title="Unpublished Problem", created=cls.now, created_by=None, ) cls.unpublished_html, _version = components_api.create_component_and_version( cls.learning_package.id, - namespace="xblock.v1", - type_name="html", - local_key="unpublished_html", + component_type=cls.html_type, + local_key="uph_lk", title="Unpublished HTML", created=cls.now, created_by=None, @@ -141,9 +140,8 @@ def setUpTestData(cls) -> None: # Draft entry) cls.deleted_video, _version = components_api.create_component_and_version( cls.learning_package.id, - namespace="xblock.v1", - type_name="html", - local_key="deleted_video", + component_type=cls.video_type, + local_key="dv_lk", title="Deleted Video", created=cls.now, created_by=None, @@ -274,12 +272,10 @@ def test_published_title_filter(self): ] -class ComponentGetAndExistsTestCase(TestCase): +class ComponentGetAndExistsTestCase(ComponentTestCase): """ Test getting a Component by primary key or key string. """ - learning_package: LearningPackage - now: datetime problem: Component html: Component @@ -292,23 +288,18 @@ def setUpTestData(cls) -> None: for this we only care about the metadata on Compnents, their versions, and the associated draft/publish status. """ - cls.learning_package = publishing_api.create_learning_package( - "components.TestComponentGetAndExists", - "Learning Package for Testing Getting a Component", - ) - cls.now = datetime(2023, 5, 8, tzinfo=timezone.utc) + super().setUpTestData() + cls.problem = components_api.create_component( cls.learning_package.id, - namespace='xblock.v1', - type_name='problem', + component_type=cls.problem_type, local_key='my_component', created=cls.now, created_by=None, ) cls.html = components_api.create_component( cls.learning_package.id, - namespace='xblock.v1', - type_name='html', + component_type=cls.html_type, local_key='my_component', created=cls.now, created_by=None, @@ -349,29 +340,24 @@ def test_exists_by_key(self): ) -class CreateNewVersionsTestCase(TestCase): +class CreateNewVersionsTestCase(ComponentTestCase): """ Create new ComponentVersions in various ways. """ - learning_package: LearningPackage - now: datetime problem: Component + text_media_type: MediaType @classmethod def setUpTestData(cls) -> None: - cls.learning_package = publishing_api.create_learning_package( - "components.TestCreateNextVersion", - "Learning Package for Testing Next Version Creation", - ) - cls.now = datetime(2023, 5, 8, tzinfo=timezone.utc) + super().setUpTestData() cls.problem = components_api.create_component( cls.learning_package.id, - namespace='xblock.v1', - type_name='problem', + component_type=cls.problem_type, local_key='my_component', created=cls.now, created_by=None, ) + cls.text_media_type = contents_api.get_or_create_media_type("text/plain") def test_add(self): new_version = components_api.create_component_version( @@ -381,15 +367,15 @@ def test_add(self): created=self.now, created_by=None, ) - new_content, _created = contents_api.get_or_create_raw_content( + new_content = contents_api.get_or_create_text_content( self.learning_package.pk, - b"This is some data", - mime_type="text/plain", + self.text_media_type.id, + text="This is some data", created=self.now, ) - components_api.add_content_to_component_version( + components_api.create_component_version_content( new_version.pk, - raw_content_id=new_content.pk, + new_content.pk, key="hello.txt", learner_downloadable=False, ) @@ -399,26 +385,26 @@ def test_add(self): .get(publishable_entity_version__version_num=1) assert ( new_content == - new_version.raw_contents.get(componentversionrawcontent__key="hello.txt") + new_version.contents.get(componentversioncontent__key="hello.txt") ) def test_multiple_versions(self): - hello_content, _created = contents_api.get_or_create_text_content_from_bytes( + hello_content = contents_api.get_or_create_text_content( self.learning_package.id, - data_bytes="Hello World!".encode('utf-8'), - mime_type="text/plain", + self.text_media_type.id, + text="Hello World!", created=self.now, ) - goodbye_content, _created = contents_api.get_or_create_text_content_from_bytes( + goodbye_content = contents_api.get_or_create_text_content( self.learning_package.id, - data_bytes="Goodbye World!".encode('utf-8'), - mime_type="text/plain", + self.text_media_type.id, + text="Goodbye World!", created=self.now, ) - blank_content, _created = contents_api.get_or_create_text_content_from_bytes( + blank_content = contents_api.get_or_create_text_content( self.learning_package.id, - data_bytes="".encode('utf-8'), - mime_type="text/plain", + self.text_media_type.id, + text="", created=self.now, ) @@ -434,19 +420,17 @@ def test_multiple_versions(self): ) assert version_1.version_num == 1 assert version_1.title == "Problem Version 1" - version_1_contents = list(version_1.raw_contents.all()) + version_1_contents = list(version_1.contents.all()) assert len(version_1_contents) == 2 assert ( hello_content == - version_1.raw_contents - .get(componentversionrawcontent__key="hello.txt") - .text_content + version_1.contents + .get(componentversioncontent__key="hello.txt") ) assert ( goodbye_content == - version_1.raw_contents - .get(componentversionrawcontent__key="goodbye.txt") - .text_content + version_1.contents + .get(componentversioncontent__key="goodbye.txt") ) # This should keep the old value for goodbye.txt, add blank.txt, and set @@ -461,24 +445,21 @@ def test_multiple_versions(self): created=self.now, ) assert version_2.version_num == 2 - assert version_2.raw_contents.count() == 3 + assert version_2.contents.count() == 3 assert ( blank_content == - version_2.raw_contents - .get(componentversionrawcontent__key="hello.txt") - .text_content + version_2.contents + .get(componentversioncontent__key="hello.txt") ) assert ( goodbye_content == - version_2.raw_contents - .get(componentversionrawcontent__key="goodbye.txt") - .text_content + version_2.contents + .get(componentversioncontent__key="goodbye.txt") ) assert ( blank_content == - version_2.raw_contents - .get(componentversionrawcontent__key="blank.txt") - .text_content + version_2.contents + .get(componentversioncontent__key="blank.txt") ) # Now we're going to set "hello.txt" back to hello_content, but remove @@ -495,10 +476,9 @@ def test_multiple_versions(self): created=self.now, ) assert version_3.version_num == 3 - assert version_3.raw_contents.count() == 1 + assert version_3.contents.count() == 1 assert ( hello_content == - version_3.raw_contents - .get(componentversionrawcontent__key="hello.txt") - .text_content + version_3.contents + .get(componentversioncontent__key="hello.txt") ) diff --git a/tests/openedx_learning/core/components/test_models.py b/tests/openedx_learning/core/components/test_models.py index d4bb20e3..322f235b 100644 --- a/tests/openedx_learning/core/components/test_models.py +++ b/tests/openedx_learning/core/components/test_models.py @@ -3,7 +3,12 @@ """ from datetime import datetime, timezone -from openedx_learning.core.components.api import create_component_and_version, get_component +from openedx_learning.core.components.api import ( + create_component_and_version, + get_component, + get_or_create_component_type, +) +from openedx_learning.core.components.models import ComponentType from openedx_learning.core.publishing.api import LearningPackage, create_learning_package, publish_all_drafts from openedx_learning.lib.test_utils import TestCase @@ -14,6 +19,7 @@ class TestModelVersioningQueries(TestCase): """ learning_package: LearningPackage now: datetime + problem_type: ComponentType @classmethod def setUpTestData(cls) -> None: # Note: we must specify '-> None' to opt in to type checking @@ -22,12 +28,12 @@ def setUpTestData(cls) -> None: # Note: we must specify '-> None' to opt in to "Learning Package for Testing Component Versioning", ) cls.now = datetime(2023, 5, 8, tzinfo=timezone.utc) + cls.problem_type = get_or_create_component_type("xblock.v1", "problem") def test_latest_version(self) -> None: component, component_version = create_component_and_version( self.learning_package.id, - namespace="xblock.v1", - type_name="problem", + component_type=self.problem_type, local_key="monty_hall", title="Monty Hall Problem", created=self.now, diff --git a/tests/openedx_learning/core/contents/test_media_types.py b/tests/openedx_learning/core/contents/test_media_types.py index 96390cb4..3a7c93e1 100644 --- a/tests/openedx_learning/core/contents/test_media_types.py +++ b/tests/openedx_learning/core/contents/test_media_types.py @@ -5,30 +5,20 @@ from openedx_learning.lib.test_utils import TestCase -class MediaTypeCachingTestCase(TestCase): - """ - Test that our LRU caching is working as expected. - """ - def test_media_query_caching(self): - """Test MediaType queries auto-create and caching.""" - assert contents_api.get_or_create_media_type_id.cache_info().currsize == 0 +class MediaTypeTest(TestCase): + """Basic testing of our Media Types for Content""" - mime_type_str = "application/vnd.openedx.xblock.v1.problem+xml" - media_type_id = contents_api.get_or_create_media_type_id(mime_type_str) - - # Now it should be loaded in the cache - assert contents_api.get_or_create_media_type_id.cache_info().currsize == 1 - - # Second call pulls from cache instead of the database - with self.assertNumQueries(0): - # Should also return the same thing it did last time. - assert media_type_id == contents_api.get_or_create_media_type_id(mime_type_str) - - def test_media_query_caching_reset(self): + def test_get_or_create_dedupe(self): """ - Test that setUp/tearDown reset the get_media_type_id LRU cache. - - This test method's *must* execute after test_media_query_caching to be - meaningful (they execute in string sort order). + Make sure we're not creating redundant rows for the same media type. """ - assert contents_api.get_or_create_media_type_id.cache_info().currsize == 0 + # The first time, a row is created for "text/html" + text_media_type_1 = contents_api.get_or_create_media_type("text/plain") + + # This should return the previously created row. + text_media_type_2 = contents_api.get_or_create_media_type("text/plain") + assert text_media_type_1 == text_media_type_2 + + # This is a different type though... + svg_media_type = contents_api.get_or_create_media_type("image/svg+xml") + assert text_media_type_1 != svg_media_type