"""
Import OLX data into LORE.
"""
from __future__ import unicode_literals
from shutil import rmtree
import logging
from tempfile import mkdtemp
from os.path import join, exists
from os import listdir
from bs4 import BeautifulSoup
from archive import Archive, ArchiveException
from django.core.files.storage import default_storage
from django.db import transaction
from lxml import etree
from xbundle import XBundle, DESCRIPTOR_TAGS
from importer.tasks import populate_xanalytics_fields
from learningresources.api import (
create_course,
create_resource,
get_resources,
get_video_sub,
import_static_assets,
join_description_paths,
MissingTitle,
)
from learningresources.models import (
LearningResource,
StaticAsset,
course_asset_basepath
)
from search.utils import index_resources
log = logging.getLogger(__name__)
def import_course_from_file(filename, repo_id, user_id):
    """
    Import OLX archive from .zip or tar.gz.

    Imports from a file and then deletes that file.
    A valid OLX archive has a single occurrence of the file course.xml in its
    root directory, or no course.xml in its root and a single occurrence of
    course.xml in one or more of the root directory's children.

    The archive is extracted into a temporary directory. Both the stored
    archive and the temporary directory are removed before returning,
    whether or not the import succeeded.

    Args:
        filename (unicode): Path to archive file (zip or .tar.gz)
        repo_id (int): Primary key of repository course belongs to
        user_id (int): Primary key of user importing the course
    Returns:
        None
    Raises:
        ValueError: Unable to extract or read archive contents.
    """
    tempdir = mkdtemp()

    # HACK: Have to patch in "seekable" attribute for python3 and tar
    # See: https://code.djangoproject.com/ticket/24963#ticket. Remove
    # when updating to Django 1.9
    def seekable():
        """Hacked seekable for django storage to work in python3"""
        return True

    try:
        course_archive = default_storage.open(filename)
        try:
            course_archive.seekable = seekable
            Archive(
                course_archive
            ).extract(to_path=tempdir, method="safe")
        except ArchiveException as ex:
            log.debug("failed to extract: %s", ex)
            log.exception('Archive exception occurred')
            raise ValueError("Invalid OLX archive, unable to extract.")
        finally:
            # The handle is only needed for extraction; release it before
            # the stored file is deleted so it is never leaked.
            course_archive.close()

        course_imported = False
        if "course.xml" in listdir(tempdir):
            # course.xml sits directly in the archive root.
            import_course_from_path(tempdir, repo_id, user_id)
            course_imported = True
        else:
            # Otherwise import every first-level child that holds a
            # course.xml of its own.
            for path in listdir(tempdir):
                if exists(join(tempdir, path, 'course.xml')):
                    import_course_from_path(
                        join(tempdir, path), repo_id, user_id
                    )
                    course_imported = True
        if not course_imported:
            raise ValueError("Invalid OLX archive, no courses found.")
    finally:
        # Nested so the temporary directory is removed even when deleting
        # the stored archive fails.
        try:
            default_storage.delete(filename)
        finally:
            rmtree(tempdir)
def import_course_from_path(path, repo_id, user_id):
    """
    Import course from an OLX directory.

    The directory is loaded into an XBundle and the resulting course is
    created inside a single database transaction.

    Args:
        path (unicode): Path to extracted OLX tree
        repo_id (int): Primary key of repository course belongs to
        user_id (int): Primary key of Django user doing the import
    Returns:
        course (learningresources.Course)
    """
    xbundle_options = {
        "keep_urls": True,
        "keep_studio_urls": True,
        "preserve_url_name": True,
    }
    olx_bundle = XBundle(**xbundle_options)
    olx_bundle.import_from_directory(path)

    # Static files are looked up in the "static" child of the OLX tree.
    static_path = join(path, 'static')
    with transaction.atomic():
        return import_course(olx_bundle, repo_id, user_id, static_path)
def import_course(bundle, repo_id, user_id, static_dir):
    """
    Import a course from an XBundle object.

    Creates the course, imports its static assets, walks the XML tree to
    create the learning resources, calls ``populate_xanalytics_fields.delay``
    with the course id, and finally indexes all of the course's resources
    in one call.

    Args:
        bundle (xbundle.XBundle): Course as xbundle XML
        repo_id (int): Primary key of repository course belongs to
        user_id (int): Primary key of Django user doing the import
        static_dir (unicode): location of static files
    Returns:
        learningresources.models.Course
    """
    root = bundle.course
    root_attrs = root.attrib
    org = root_attrs["org"]
    course_number = root_attrs["course"]
    run = root_attrs["semester"]

    course = create_course(
        org=org,
        repo_id=repo_id,
        course_number=course_number,
        run=run,
        user_id=user_id,
    )

    # Static assets go first: import_children looks them up when linking
    # them to the resources it creates.
    import_static_assets(course, static_dir)
    import_children(course, root, None, '')
    populate_xanalytics_fields.delay(course.id)

    # Index every LearningResource of the course in a single bulk call
    # rather than one at a time.
    course_resources = get_resources(repo_id).filter(course__id=course.id)
    index_resources(course_resources.values_list("id", flat=True))
    return course
def is_leaf_tag(tag):
    """
    Tell whether elements with this tag are leaves of the import tree.

    Leaf elements are not searched for nested resources.

    Args:
        tag (unicode): Element tag
    Returns:
        bool: Whether tag is leaf tag
    """
    leaf_tags = frozenset(('video', 'html', 'problem', 'discussion'))
    return tag in leaf_tags
# pylint: disable=too-many-branches
def import_children(course, element, parent, parent_dpath):
    """
    Create LearningResource instances for each element
    of an XML tree.

    A resource is created for ``element``, linked to the static assets it
    refers to, and then the function recurses into the element's children
    unless the element is a leaf tag.

    Args:
        course (learningresources.models.Course): Course
        element (lxml.etree): XML element within xbundle
        parent (learningresources.models.LearningResource):
            Parent LearningResource
        parent_dpath (unicode): parent description path
    Returns:
        None
    """
    # pylint: disable=too-many-locals
    attributes = element.attrib
    title = attributes.get("display_name", MissingTitle.for_title_field)
    # An untitled element uses a dedicated placeholder in its description
    # path instead of the title placeholder.
    if title == MissingTitle.for_title_field:
        desc_path = MissingTitle.for_desc_path_field
    else:
        desc_path = title

    mpath = etree.ElementTree(element).getpath(element)
    dpath = join_description_paths(parent_dpath, desc_path)
    # Fall back to display_name (or None) when url_name is absent.
    url_name = attributes.get("url_name", attributes.get("display_name", None))

    resource = create_resource(
        course=course,
        parent=parent,
        resource_type=element.tag,
        title=title,
        content_xml=etree.tostring(element),
        mpath=mpath,
        url_name=url_name,
        dpath=dpath,
    )

    # (resource, asset) pairs collected here and written in one bulk insert.
    asset_links = set()
    static_prefix = "/static/"

    if element.tag == "video":  # pylint: disable=too-many-nested-blocks
        subname = get_video_sub(element)
        if subname != "":
            matching_assets = StaticAsset.objects.filter(
                course__id=resource.course_id,
                asset=course_asset_basepath(course, subname),
            )
            asset_links.update(
                (resource, found) for found in matching_assets
            )
    else:
        # Scan every attribute of every descendant for a value that starts
        # with /static/ and link the resource to the matching StaticAsset
        # when one exists. Checking all attributes avoids special-casing
        # a/link/img/iframe/script and their href/src attributes.
        soup = BeautifulSoup(etree.tostring(element), 'lxml')
        for node in soup.findAll():
            for value in node.attrs.values():
                try:
                    if not value.startswith(static_prefix):
                        continue
                    relative_path = value[len(static_prefix):]
                    try:
                        found = StaticAsset.objects.get(
                            course__id=resource.course_id,
                            asset=course_asset_basepath(
                                course, relative_path),
                        )
                    except StaticAsset.DoesNotExist:
                        continue
                    asset_links.add((resource, found))
                except AttributeError:
                    # not a string
                    continue

    # Bulk insert through the M2M table to avoid signals during the
    # learning resource .save(). Each signal triggers a reindex of the
    # learning resource, which is useless during import because all the
    # learning resources are indexed in bulk at the end of the import anyway.
    through_model = LearningResource.static_assets.through
    through_rows = [
        through_model(
            learningresource_id=linked_resource.id,
            staticasset_id=linked_asset.id,
        )
        for linked_resource, linked_asset in asset_links
    ]
    through_model.objects.bulk_create(through_rows)

    # Try to protect against bad data, specifically <problem><problem>...
    # imports. The two tags will still appear in content_xml but there will
    # be only one resource for the outer one.
    if is_leaf_tag(element.tag):
        return
    for child in element.getchildren():
        if child.tag in DESCRIPTOR_TAGS:
            import_children(course, child, resource, dpath)