# Source code for importer.api

"""
Import OLX data into LORE.
"""

from __future__ import unicode_literals

from shutil import rmtree
import logging
from tempfile import mkdtemp
from os.path import join, exists
from os import listdir

from bs4 import BeautifulSoup

from archive import Archive, ArchiveException
from django.core.files.storage import default_storage
from django.db import transaction
from lxml import etree
from xbundle import XBundle, DESCRIPTOR_TAGS

from importer.tasks import populate_xanalytics_fields
from learningresources.api import (
    create_course,
    create_resource,
    get_resources,
    get_video_sub,
    import_static_assets,
    join_description_paths,
    MissingTitle,
)
from learningresources.models import (
    LearningResource,
    StaticAsset,
    course_asset_basepath
)
from search.utils import index_resources

log = logging.getLogger(__name__)


def import_course_from_file(filename, repo_id, user_id):
    """
    Import OLX archive from .zip or tar.gz.

    Imports from a file and then deletes that file from the default
    storage, whether or not the import succeeded.

    A valid OLX archive has either a course.xml file in its root directory,
    or no course.xml in its root and a course.xml directly inside one or
    more of the root directory's children. In the latter case every child
    directory containing a course.xml is imported.

    Args:
        filename (unicode): Path to archive file (zip or .tar.gz)
        repo_id (int): Primary key of repository course belongs to
        user_id (int): Primary key of user importing the course
    Returns:
        None
    Raises:
        ValueError: Unable to extract the archive, or no course.xml was
            found in it.
    """
    tempdir = mkdtemp()

    # HACK: Have to patch in "seekable" attribute for python3 and tar
    # See: https://code.djangoproject.com/ticket/24963#ticket. Remove
    # when updating to Django 1.9
    def seekable():
        """Hacked seekable for django storage to work in python3"""
        return True

    try:
        course_archive = default_storage.open(filename)
        try:
            course_archive.seekable = seekable
            Archive(
                course_archive
            ).extract(to_path=tempdir, method="safe")
        except ArchiveException as ex:
            log.debug("failed to extract: %s", ex)
            log.exception('Archive exception occurred')
            raise ValueError("Invalid OLX archive, unable to extract.")
        finally:
            # Everything needed is now on disk in tempdir; release the
            # storage handle before the stored file is deleted below.
            course_archive.close()

        course_imported = False
        if "course.xml" in listdir(tempdir):
            import_course_from_path(tempdir, repo_id, user_id)
            course_imported = True
        else:
            # No course at the root: look one level down only.
            for path in listdir(tempdir):
                course_dir = join(tempdir, path)
                if exists(join(course_dir, 'course.xml')):
                    import_course_from_path(course_dir, repo_id, user_id)
                    course_imported = True
        if not course_imported:
            raise ValueError("Invalid OLX archive, no courses found.")
    finally:
        # Nested so that a failure to delete the stored upload cannot
        # leave the extracted tree behind in the temp directory.
        try:
            default_storage.delete(filename)
        finally:
            rmtree(tempdir)


def import_course_from_path(path, repo_id, user_id):
    """
    Import course from an OLX directory.

    The directory is loaded into an XBundle and then imported inside a
    single database transaction.

    Args:
        path (unicode): Path to extracted OLX tree
        repo_id (int): Primary key of repository course belongs to
        user_id (int): Primary key of Django user doing the import
    Returns:
        course (learningresources.Course)
    """
    bundle = XBundle(
        keep_urls=True, keep_studio_urls=True, preserve_url_name=True
    )
    bundle.import_from_directory(path)
    # Static files are expected in the "static" subdirectory of the tree.
    static_dir = join(path, 'static')
    with transaction.atomic():
        course = import_course(bundle, repo_id, user_id, static_dir)
    return course


def import_course(bundle, repo_id, user_id, static_dir):
    """
    Import a course from an XBundle object.

    Creates the course from the ``org``, ``course`` and ``semester``
    attributes of the bundle's course element, imports its static assets,
    creates resources for the XML tree, queues the xanalytics task and
    finally indexes all of the course's resources in one call.

    Args:
        bundle (xbundle.XBundle): Course as xbundle XML
        repo_id (int): Primary key of repository course belongs to
        user_id (int): Primary key of Django user doing the import
        static_dir (unicode): location of static files
    Returns:
        learningresources.models.Course
    Raises:
        KeyError: The course element lacks one of the ``org``, ``course``
            or ``semester`` attributes.
    """
    src = bundle.course
    course = create_course(
        org=src.attrib["org"],
        repo_id=repo_id,
        course_number=src.attrib["course"],
        run=src.attrib["semester"],
        user_id=user_id,
    )
    # Static assets go first: import_children links resources to
    # StaticAsset rows that must already exist.
    import_static_assets(course, static_dir)
    import_children(course, src, None, '')
    # NOTE(review): import_course_from_path calls this function inside
    # transaction.atomic(), so this task is queued before the course is
    # committed -- confirm the task tolerates running before the commit.
    populate_xanalytics_fields.delay(course.id)
    # This triggers a bulk indexing of all LearningResource instances
    # for the course at once.
    index_resources(
        get_resources(repo_id).filter(
            course__id=course.id).values_list("id", flat=True))
    return course


def is_leaf_tag(tag):
    """
    Is this a leaf tag, i.e. one whose children are not imported as
    separate resources?

    Args:
        tag (unicode): Element tag
    Returns:
        bool: Whether tag is leaf tag
    """
    return tag in {'video', 'html', 'problem', 'discussion'}


# pylint: disable=too-many-branches
def import_children(course, element, parent, parent_dpath):
    """
    Create LearningResource instances for each element
    of an XML tree.

    A resource is created for ``element``, it is linked to the static
    assets it refers to, and then the function recurses into the
    element's children unless the element is a leaf tag.

    Args:
        course (learningresources.models.Course): Course
        element (lxml.etree): XML element within xbundle
        parent (learningresources.models.LearningResource):
            Parent LearningResource
        parent_dpath (unicode): parent description path
    Returns:
        None
    """
    # pylint: disable=too-many-locals
    title = element.attrib.get(
        "display_name", MissingTitle.for_title_field)
    # The description path uses its own placeholder for untitled elements.
    desc_path = title
    if desc_path == MissingTitle.for_title_field:
        desc_path = MissingTitle.for_desc_path_field
    mpath = etree.ElementTree(element).getpath(element)
    dpath = join_description_paths(parent_dpath, desc_path)
    # Fall back to the display name (or None) when there is no url_name.
    url_name = element.attrib.get(
        "url_name",
        element.attrib.get("display_name", None)
    )
    resource = create_resource(
        course=course, parent=parent, resource_type=element.tag,
        title=title,
        content_xml=etree.tostring(element),
        mpath=mpath,
        url_name=url_name,
        dpath=dpath,
    )
    # Static assets to associate with this resource; a set so that an
    # asset referenced several times is linked only once.
    linked_assets = set()
    target = "/static/"
    if element.tag == "video":  # pylint: disable=too-many-nested-blocks
        subname = get_video_sub(element)
        if subname != "":
            linked_assets.update(
                StaticAsset.objects.filter(
                    course__id=resource.course_id,
                    asset=course_asset_basepath(course, subname),
                )
            )
    else:
        # Recursively find all sub-elements, looking for anything which
        # refers to /static/. Then make the association between the
        # LearningResource and StaticAsset if the StaticAsset exists.
        # This is like doing soup.find_all("a") and checking for whether
        # "/static/" is in the href, which would work but also requires
        # more code to check for link, img, iframe, script, and others,
        # and within those, check for href or src existing.
        soup = BeautifulSoup(etree.tostring(element), 'lxml')
        for node in soup.find_all():
            for val in node.attrs.values():
                # Only the startswith call is guarded, so AttributeErrors
                # raised by the lookup below are not silently swallowed.
                try:
                    is_static_ref = val.startswith(target)
                except AttributeError:
                    continue  # not a string
                if not is_static_ref:
                    continue
                path = val[len(target):]
                try:
                    asset = StaticAsset.objects.get(
                        course__id=resource.course_id,
                        asset=course_asset_basepath(course, path),
                    )
                except StaticAsset.DoesNotExist:
                    continue
                linked_assets.add(asset)
    # Bulk insert of static assets
    # Using this approach to avoid signals during the learning resource .save()
    # Each signal triggers a reindex of the learning resource that is useless
    # during import because all the learning resources are indexed in bulk at
    # the end of the import anyway
    ThroughModel = LearningResource.static_assets.through
    ThroughModel.objects.bulk_create(
        [
            ThroughModel(
                learningresource_id=resource.id,
                staticasset_id=linked_asset.id
            )
            for linked_asset in linked_assets
        ]
    )

    # Try to protect against bad data, specifically <problem><problem>...
    # imports. The two tags will still appear in content_xml but there will
    # be only one resource for the outer one.
    if not is_leaf_tag(element.tag):
        # Iterating the element yields its direct children; this replaces
        # the deprecated getchildren().
        for child in element:
            if child.tag in DESCRIPTOR_TAGS:
                import_children(course, child, resource, dpath)