test_parsing.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import json

import pytest

from edx_dl.common import DEFAULT_FILE_FORMATS

from edx_dl.parsing import (
    edx_json2srt,
    ClassicEdXPageExtractor,
    CurrentEdXPageExtractor,
    is_youtube_url,
)


# Test conversion of JSON subtitles to srt
def test_empty_json_subtitle():
    with open('test/json/empty.json') as f:
        json_string = f.read()
    with pytest.raises(ValueError):
        json_contents = json.loads(json_string)


@pytest.mark.parametrize(
    'file,expected', [
        ('test/json/empty-text.json', ''),
        ('test/json/minimal.json', ''),
        ('test/json/abridged-01.json', ('0\n'
                                        '00:00:18,104 --> 00:00:20,428\n'
                                        'I am very glad to see everyone here，\n\n')),
        ('test/json/abridged-02.json', ('0\n'
                                        '00:00:18,104 --> 00:00:20,428\n'
                                        'I am very glad to see everyone here，\n\n'
                                        '1\n'
                                        '00:00:20,569 --> 00:00:24,721\n'
                                        'so let\'s enjoy the beauty of combinatorics together.\n\n'))
    ]
)
def test_subtitles_from_json(file, expected):
    with open(file) as f:
        json_contents = json.loads(f.read())
    res = edx_json2srt(json_contents)
    assert res == expected


# Test extraction of video/other assets from HTML
def test_extract_units_from_html_single_unit_multiple_subs():
    site = 'https://courses.edx.org'
    with open("test/html/single_unit_multiple_subs.html", "r") as f:
        units = CurrentEdXPageExtractor().extract_units_from_html(f.read(),
                                                                  site,
                                                                  DEFAULT_FILE_FORMATS)

        assert units[0].videos[0].video_youtube_url == 'https://youtube.com/watch?v=b7xgknqkQk8'
        assert units[0].videos[0].mp4_urls[0] == 'https://d2f1egay8yehza.cloudfront.net/edx-edx101/EDXSPCPJSP13-H010000_100.mp4'
        assert units[0].videos[0].sub_template_url == 'https://courses.edx.org/courses/edX/DemoX.1/2014/xblock/i4x:;_;_edX;_DemoX.1;_video;_14459340170c476bb65f73a0a08a076f/handler/transcript/translation/%s'


def test_extract_multiple_units_multiple_resources():
    site = 'https://courses.edx.org'
    with open("test/html/multiple_units.html", "r") as f:
        units = CurrentEdXPageExtractor().extract_units_from_html(f.read(),
                                                                  site,
                                                                  DEFAULT_FILE_FORMATS)
        assert len(units) == 3
        # this one has multiple speeds in the data-streams field
        assert 'https://youtube.com/watch?v=CJ482b9r_0g' in [video.video_youtube_url for video in units[0].videos]
        assert len(units[0].videos[0].mp4_urls) > 0
        assert 'https://s3.amazonaws.com/berkeley-cs184x/videos/overview-motivation.mp4' in units[0].videos[0].mp4_urls
        assert 'https://courses.edx.org/static/content-berkeley-cs184x~2012_Fall/slides/overview.pdf' in units[0].resources_urls


def test_extract_multiple_units_no_youtube_ids():
    site = 'https://courses.edx.org'
    with open("test/html/multiple_units_no_youtube_ids.html", "r") as f:
        units = ClassicEdXPageExtractor().extract_units_from_html(f.read(),
                                                                  site,
                                                                  DEFAULT_FILE_FORMATS)
        assert units[0].videos[0].video_youtube_url is None
        assert len(units[0].videos[0].mp4_urls) > 0


def test_extract_multiple_units_youtube_link():
    site = 'https://courses.edx.org'
    with open("test/html/multiple_units_youtube_link.html", "r") as f:
        units = CurrentEdXPageExtractor().extract_units_from_html(f.read(),
                                                                  site,
                                                                  DEFAULT_FILE_FORMATS)
        assert 'https://www.youtube.com/watch?v=5OXQypOAbdI' in units[0].resources_urls


def test_extract_multiple_units_multiple_youtube_videos():
    site = 'https://courses.edx.org'
    with open("test/html/multiple_units_multiple_youtube_videos.html", "r") as f:
        units = CurrentEdXPageExtractor().extract_units_from_html(f.read(),
                                                                  site,
                                                                  DEFAULT_FILE_FORMATS)
        assert len(units[0].videos) == 3
        assert 'https://youtube.com/watch?v=3atHHNa2UwI' in [video.video_youtube_url for video in units[0].videos]


@pytest.mark.parametrize(
    'file,num_sections_expected,num_subsections_expected', [
        ('test/html/new_sections_structure.html', 2, 12),
        ('test/html/empty_sections.html', 0, 0)
    ]
)
def test_extract_sections(file, num_sections_expected, num_subsections_expected):
    site = 'https://courses.edx.org'
    with open(file, "r") as f:
        sections = CurrentEdXPageExtractor().extract_sections_from_html(f.read(), site)
        assert len(sections) == num_sections_expected
        num_subsections = sum(len(section.subsections) for section in sections)
        assert num_subsections == num_subsections_expected


@pytest.mark.parametrize(
    'filename,site,num_courses_expected,num_available_courses_expected', [
        ('test/html/dashboard-version-with-articles.html', 'https://courses.edx.org', 18, 14),
        ('test/html/dashboard-version-with-divs.html', 'https://courses.edx.org', 18, 14),
    ]
)
def test_extract_courses_from_html(filename, site, num_courses_expected, num_available_courses_expected):
    with open(filename, "r") as f:
        courses = CurrentEdXPageExtractor().extract_courses_from_html(f.read(), site)
        assert len(courses) == num_courses_expected
        available_courses = [course for course in courses if course.state == 'Started']
        assert len(available_courses) == num_available_courses_expected


def test_is_youtube_url():
    invalid_urls = [
        'http://www.google.com/', 'TODO',
        'https://d2f1egay8yehza.cloudfront.net/mit-24118/MIT24118T314-V015000_DTH.mp4',
        'https://courses.edx.org/courses/course-v1:MITx+24.118x+2T2015/xblock/block-v1:MITx+24.118x+2T2015+type@video+block@b1588e7cccff4d448f4f9676c81184d9/handler/transcript/available_translations'
    ]
    valid_urls = [
        'http://www.youtu.be/rjOpZ3i6pRo',
        'http://www.youtube.com/watch?v=rjOpZ3i6pRo',
        'http://youtu.be/rjOpZ3i6pRo',
        'http://youtube.com/watch?v=rjOpZ3i6pRo',
        'https://www.youtu.be/rjOpZ3i6pRo',
        'https://www.youtube.com/watch?v=rjOpZ3i6pRo',
        'https://youtu.be/rjOpZ3i6pRo',
        'https://youtube.com/watch?v=rjOpZ3i6pRo',
    ]
    for url in invalid_urls:
        assert not is_youtube_url(url)
    for url in valid_urls:
        assert is_youtube_url(url)