450 lines
13 KiB
C#
450 lines
13 KiB
C#
using System.Globalization;
|
|
using HtmlAgilityPack;
|
|
using System.Web;
|
|
using OpenQA.Selenium.Chrome;
|
|
using OpenQA.Selenium.Support.UI;
|
|
using UWLib;
|
|
using OpenQA.Selenium;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using System.Text;
|
|
using System.IO;
|
|
using System.Text.RegularExpressions;
|
|
using Microsoft.EntityFrameworkCore.Design;
|
|
using System.Reflection;
|
|
using System.ComponentModel;
|
|
|
|
namespace UWScraper
|
|
{
|
|
|
|
internal static class Scraper
|
|
{
|
|
/// <summary>
/// Progress payload handed to subscribers of the <c>Init</c> and <c>Scraped</c> events.
/// </summary>
public class ScrapedEventArgs : EventArgs
{
    /// <summary>Count of URLs that were already processed when the event was raised.</summary>
    public int Scraped { get; set; }

    /// <summary>Count of URLs that were still queued when the event was raised.</summary>
    public int ToScrape { get; set; }
}
|
|
|
|
// URLs that count as processed: seeded from the database in Scrape() and
// extended by ScrapeUrl() after every visit.
private static readonly List<string> checkedUrls = new();

// Work queue of URLs that still have to be visited.
private static readonly List<string> urlsToCheck = new();

// When true, Scrape() queues the URLs of stored lectures and Navigate()
// skips link discovery.
private static bool refreshLectures;

// Seconds Navigate() waits for one of the expected page elements.
private static uint timeout = 2;

// Semester filter applied in Navigate(); the value "all" disables it.
private static string semester = "";

// Database context; Scrape() replaces it with one for the configured path.
private static LectureContext db = new("lecture.db");

// Browser session, assigned once by the static constructor.
private static readonly ChromeDriver? driver;

/// <summary>Handler signature for events that report scrape progress.</summary>
public delegate void ScrapedEventHandler(ScrapedEventArgs e);

/// <summary>
/// Parameterless handler signature. None of the events declared next to it
/// use it; it is kept for callers that may reference the type.
/// </summary>
public delegate void InitEventHandler();

/// <summary>Raised by <c>Scrape</c> after each URL has been processed.</summary>
public static event ScrapedEventHandler? Scraped;

/// <summary>
/// Raised by <c>Scrape</c> once, before the first URL is processed. It uses
/// <see cref="ScrapedEventHandler"/> because the initial counts are passed along.
/// </summary>
public static event ScrapedEventHandler? Init;
|
|
|
|
/// <summary>
/// Ends the browser session if a driver instance exists; does nothing otherwise.
/// </summary>
internal static void QuitDriver() => driver?.Quit();
|
|
|
|
/// <summary>
/// Type initializer: starts the Chrome session that is used for all page loads.
/// </summary>
static Scraper()
{
    var driverService = ChromeDriverService.CreateDefaultService();

    // Keep the chromedriver console window out of the way.
    driverService.HideCommandPromptWindow = true;

    var chromeOptions = new ChromeOptions();

    // Headless mode is currently switched off:
    //chromeOptions.AddArgument("--headless");

    driver = new ChromeDriver(driverService, chromeOptions);
}
|
|
|
|
/// <summary>
/// Runs one scrape session: opens the configured database, builds the queue of
/// URLs to visit and processes it until it is empty.
/// </summary>
/// <param name="settings">
/// Command settings. Read here: <c>DatabasePath</c>, <c>Timeout</c>,
/// <c>RefreshLectures</c>, <c>ClearLinksToScrape</c>, <c>Semester</c>,
/// <c>RescrapeHours</c> and <c>StartUrl</c>.
/// </param>
/// <remarks>
/// Raises <c>Init</c> once before the loop and <c>Scraped</c> after every URL,
/// both with the current sizes of the processed list and the queue.
/// </remarks>
internal static void Scrape(ScrapeCommand.Settings settings)
{
    db = new LectureContext(settings.DatabasePath);

    timeout = settings.Timeout;
    refreshLectures = settings.RefreshLectures;

    if (settings.ClearLinksToScrape)
    {
        db.LinksToScrape.RemoveRange(db.LinksToScrape.ToList());
        db.SaveChanges();
    }

    if (settings.Semester != null)
    {
        semester = settings.Semester;
    }

    if (refreshLectures)
    {
        // Refresh mode: revisit exactly the lecture pages that are already stored.
        urlsToCheck.AddRange([.. db.Lectures.Select(x => x.Url)]);
    }
    else
    {
        urlsToCheck.AddRange([.. db.LinksToScrape.OrderBy(x => x.Sort).Select(x => x.Url)]);

        // Links scraped within the last RescrapeHours hours count as done.
        // The setting is expressed in hours, so the cutoff must be computed with
        // AddHours (AddDays would widen the window by a factor of 24). It is
        // evaluated once on the client, the same clock RefreshScrapedLink uses.
        var cutoff = DateTime.Now.AddHours(-settings.RescrapeHours);
        checkedUrls.AddRange([.. db.ScrapedLinks.Where(x => x.LastScrape > cutoff).Select(x => x.Url)]);

        // Drop everything that is already done from the queue in a single pass.
        var alreadyChecked = new HashSet<string>(checkedUrls);
        urlsToCheck.RemoveAll(alreadyChecked.Contains);

        if (urlsToCheck.Count == 0)
        {
            // Nothing queued: start at the entry page (course directory).
            // [DefaultValue] is looked up on the StartUrl property; the type-level
            // lookup is kept only as a fallback for backward compatibility.
            var settingsType = settings.GetType();
            var defaultValueAttr =
                settingsType.GetProperty(nameof(settings.StartUrl))?.GetCustomAttribute<DefaultValueAttribute>()
                ?? settingsType.GetCustomAttribute<DefaultValueAttribute>();

            bool isDefaultStartUrl = defaultValueAttr != null
                && settings.StartUrl == defaultValueAttr.Value?.ToString();

            // "all" is the sentinel Navigate() uses for "no semester filter", so it
            // is not a value that can be sent to the site.
            bool hasConcreteSemester = !string.IsNullOrEmpty(semester) && semester != "all";

            if (isDefaultStartUrl && hasConcreteSemester)
            {
                // Only the untouched default URL gets the semester appended; a
                // user-supplied start URL is used verbatim.
                urlsToCheck.Add($"{settings.StartUrl}?semester={semester}");
            }
            else
            {
                urlsToCheck.Add(settings.StartUrl);
            }
        }
    }

    Console.CursorVisible = false;

    Init?.Invoke(new ScrapedEventArgs() { Scraped = checkedUrls.Count, ToScrape = urlsToCheck.Count });

    // ScrapeUrl removes the URL it was given, so the queue shrinks unless new
    // links are discovered on the page.
    while (urlsToCheck.Count > 0)
    {
        ScrapeUrl(urlsToCheck[0]);

        Scraped?.Invoke(new ScrapedEventArgs() { Scraped = checkedUrls.Count, ToScrape = urlsToCheck.Count });
    }
}
|
|
|
|
/// <summary>
/// Resolves a link found on a page against that page's URL and returns it in
/// sanitized form.
/// </summary>
/// <param name="relativeUrl">The href value as found in the markup.</param>
/// <param name="baseUrl">Absolute URL of the page that contains the link.</param>
/// <returns>The HTML-decoded absolute URL after passing through <c>SanitizeUrl</c>.</returns>
private static string GetAbsoluteUrl(string relativeUrl, string baseUrl)
{
    Uri resolved = new(new Uri(baseUrl), relativeUrl);

    // The href came out of raw markup, so entities such as &amp; are decoded here.
    string decoded = HttpUtility.HtmlDecode(resolved.AbsoluteUri);

    return SanitizeUrl(decoded);
}
|
|
|
|
/// <summary>
/// Brings a URL into a canonical form so that equal pages compare equal as strings.
/// </summary>
/// <param name="url">An absolute URL.</param>
/// <returns>
/// Scheme, authority and path of <paramref name="url"/>, followed by its query
/// parameters sorted by key (ordinal) with the keys <c>from</c>, <c>to</c> and
/// <c>details</c> removed. The fragment is dropped. No <c>?</c> is emitted when
/// no parameter remains.
/// </returns>
/// <exception cref="UriFormatException"><paramref name="url"/> is not a valid absolute URI.</exception>
public static string SanitizeUrl(string url)
{
    string[] filter = ["from", "to", "details"];

    Uri uri = new(url);
    string basePart = uri.GetLeftPart(UriPartial.Path);

    if (string.IsNullOrWhiteSpace(uri.Query))
    {
        return basePart;
    }

    var query = HttpUtility.ParseQueryString(uri.Query);
    var pairs = new List<string>();

    // Ordinal ordering keeps the canonical form independent of the machine's culture.
    foreach (var key in query.AllKeys.Where(k => !filter.Contains(k)).OrderBy(k => k, StringComparer.Ordinal))
    {
        // GetValues keeps repeated parameters apart instead of joining them with ','.
        foreach (var value in query.GetValues(key) ?? Array.Empty<string>())
        {
            // ParseQueryString hands out decoded text, so it has to be escaped
            // again; otherwise a value containing '&' or '=' corrupts the query.
            // A null key marks a parameter without '=' and is written back bare.
            pairs.Add(key is null
                ? Uri.EscapeDataString(value)
                : $"{Uri.EscapeDataString(key)}={Uri.EscapeDataString(value)}");
        }
    }

    return pairs.Count == 0 ? basePart : $"{basePart}?{string.Join('&', pairs)}";
}
|
|
|
|
/// <summary>
/// Processes a single URL: visits it, records the visit in the database and
/// moves the URL from the queue to the processed list.
/// </summary>
/// <param name="source">The URL to process.</param>
private static void ScrapeUrl(string source)
{
    // Visit the page and harvest its content and links.
    Navigate(source);

    // Persist the bookkeeping for this URL in one save.
    RefreshScrapedLink(source);
    RemoveLinkToScrape(source);
    db.SaveChanges();

    // Mirror the bookkeeping in the in-memory lists.
    checkedUrls.Add(source);
    urlsToCheck.Remove(source);
}
|
|
|
|
/// <summary>
/// Loads <paramref name="source"/> in the browser and, unless the page belongs
/// to a semester that is filtered out, stores the lecture it describes and
/// queues the links found on it.
/// </summary>
/// <param name="source">Absolute URL of the page to load.</param>
/// <remarks>
/// Best effort: any failure (invalid URL, wait timeout, parse error) ends the
/// processing of this page only. The failure is reported through
/// <see cref="System.Diagnostics.Trace"/> instead of being dropped silently.
/// </remarks>
private static void Navigate(string source)
{
    // True when a semester filter is active and the given value names another semester.
    static bool IsFilteredOut(string? pageSemester) =>
        !string.IsNullOrEmpty(pageSemester) && semester != "all" && semester != pageSemester;

    try
    {
        // Cheap check first: the URL itself may already name another semester.
        if (IsFilteredOut(GetSemesterOfUrl(source)))
        {
            return;
        }

        // Without a browser session there is nothing to load.
        if (driver is null)
        {
            return;
        }

        driver.Navigate().GoToUrl(source);

        // Wait until the page shows directory links, course links or a course title.
        WebDriverWait wait = new(driver, TimeSpan.FromSeconds(timeout));
        wait.Until(d => d.FindElements(By.XPath("//a[starts-with(@href, 'vvz_sub.html')]")).Count > 0 ||
                        d.FindElements(By.XPath("//a[starts-with(@href, 'course.html')]")).Count > 0 ||
                        d.FindElements(By.XPath("//h1/*/*[@class='what']")).Count > 0);

        HtmlDocument doc = new();
        doc.LoadHtml(driver.PageSource);
        var root = doc.DocumentNode;

        // Second check: the semester the rendered page marks as current.
        var semesterNode = root.SelectSingleNode("/html/body/main/nav/span[@class='current']");

        if (semesterNode != null && IsFilteredOut(semesterNode.InnerText))
        {
            return;
        }

        if (source.Contains("course.html"))
        {
            CreateLecture(source, root);
        }

        // In refresh mode only the known lecture pages are revisited.
        if (!refreshLectures)
        {
            FindPathLinks(source, root);
            FindCourseLinks(source, root);
        }
    }
    catch (Exception ex)
    {
        // Deliberately not rethrown so that one bad page cannot stop the run.
        System.Diagnostics.Trace.TraceWarning($"Skipped '{source}': {ex.GetType().Name}: {ex.Message}");
    }
}
|
|
|
|
/// <summary>
/// Reads the value of the <c>semester</c> query parameter from a URL.
/// </summary>
/// <param name="source">An absolute URL.</param>
/// <returns>The parameter value, or an empty string when the URL has no such parameter.</returns>
private static string GetSemesterOfUrl(string source)
{
    const string key = "semester";

    string rawQuery = new Uri(source).Query;

    // Skip parsing when the word does not occur in the query at all.
    if (!rawQuery.Contains(key))
    {
        return string.Empty;
    }

    var parameters = HttpUtility.ParseQueryString(rawQuery);

    return parameters.AllKeys.Contains(key)
        ? (parameters[key] ?? string.Empty)
        : string.Empty;
}
|
|
|
|
/// <summary>
/// Stamps the scraped-link record of <paramref name="source"/> with the current
/// local time, creating the record first when none exists. Does not save.
/// </summary>
/// <param name="source">The URL that was just visited.</param>
private static void RefreshScrapedLink(string source)
{
    // Creates a record for the URL and registers it with the context.
    ScrapedLink Track()
    {
        var created = new ScrapedLink { Url = source };
        db.ScrapedLinks.Add(created);
        return created;
    }

    ScrapedLink entry = db.ScrapedLinks.Find(source) ?? Track();
    entry.LastScrape = DateTime.Now;
}
|
|
|
|
/// <summary>
/// Queues every course link on the page that is neither processed nor queued yet.
/// </summary>
/// <param name="source">URL of the page; relative links are resolved against it.</param>
/// <param name="root">Root node of the parsed page.</param>
/// <remarks>
/// New links are stored with sort value 0 and inserted at the front of the
/// queue, so course pages are visited before further directory pages.
/// </remarks>
private static void FindCourseLinks(string source, HtmlNode root)
{
    // Select the anchors themselves. Going through their parent elements and
    // picking the first matching child again would miss every further course
    // link that shares a parent with another one.
    var courseLinks = root.SelectNodes("//a[starts-with(@href, 'course.html')]");

    if (courseLinks is null)
    {
        return;
    }

    foreach (var link in courseLinks)
    {
        var url = GetAbsoluteUrl(link.GetAttributeValue("href", ""), source);

        if (checkedUrls.Contains(url) || urlsToCheck.Contains(url))
        {
            continue;
        }

        AddLinkToScrape(url, 0);
        urlsToCheck.Insert(0, url);
    }
}
|
|
|
|
/// <summary>
/// Queues every directory link (<c>vvz_sub.html</c>) on the page that is neither
/// processed nor queued yet.
/// </summary>
/// <param name="source">URL of the page; relative links are resolved against it.</param>
/// <param name="root">Root node of the parsed page.</param>
/// <remarks>New links are stored with sort value 1 and appended to the end of the queue.</remarks>
private static void FindPathLinks(string source, HtmlNode root)
{
    var anchors = root.SelectNodes("//a[starts-with(@href, 'vvz_sub.html')]");

    // No match is reported as null rather than as an empty collection.
    if (anchors == null)
    {
        return;
    }

    foreach (var anchor in anchors)
    {
        var target = GetAbsoluteUrl(anchor.GetAttributeValue("href", ""), source);

        bool alreadyKnown = checkedUrls.Contains(target) || urlsToCheck.Contains(target);
        if (alreadyKnown)
        {
            continue;
        }

        AddLinkToScrape(target, 1);
        urlsToCheck.Add(target);
    }
}
|
|
|
|
/// <summary>
/// Marks the pending-link record of <paramref name="url"/> for deletion when
/// such a record exists. Does not save.
/// </summary>
/// <param name="url">The URL whose pending-link record is no longer needed.</param>
private static void RemoveLinkToScrape(string url)
{
    var pending = db.LinksToScrape.Find(url);

    if (pending == null)
    {
        return;
    }

    db.LinksToScrape.Remove(pending);
}
|
|
|
|
/// <summary>
/// Registers a pending-link record for <paramref name="url"/> unless one is
/// already known. Does not save.
/// </summary>
/// <param name="url">The URL to remember for a later visit.</param>
/// <param name="sort">Sort value stored with the record; <c>Scrape</c> loads pending links ordered by it.</param>
private static void AddLinkToScrape(string url, int sort)
{
    bool alreadyStored = db.LinksToScrape.Find(url) != null;

    if (alreadyStored)
    {
        return;
    }

    db.LinksToScrape.Add(new LinkToScrape { Url = url, Sort = sort });
}
|
|
|
|
/// <summary>
/// Creates or updates the lecture described by a course page.
/// </summary>
/// <param name="source">URL of the course page; its <c>lv</c> and <c>semester</c> query values are used.</param>
/// <param name="root">Root node of the parsed page.</param>
/// <remarks>
/// The lecture is identified by id and semester. The id comes from the
/// <c>lv</c> query value when the URL has one, otherwise from the number shown
/// in the page title, otherwise it stays 0. When the value that should supply
/// the id is not an integer, the page is skipped with a trace warning. Changes
/// are made on the context only; saving is left to the caller.
/// </remarks>
private static void CreateLecture(string source, HtmlNode root)
{
    var branch = root.SelectSingleNode("//*[@class='spl']");
    var number = root.SelectSingleNode("//*[@class='title']//*[@class='number']");
    var type = root.SelectSingleNode("//*[@class='title']//*[@class='type']");
    var what = root.SelectSingleNode("//*[@class='title']//*[@class='what']");
    var when = root.SelectSingleNode("//*[@class='title']//*[@class='when']");
    var info = root.SelectSingleNode("//*[@class='info list']");
    var events = root.SelectNodes("//ul[@class='classes events list']/li");

    var query = HttpUtility.ParseQueryString(new Uri(source).Query);

    // The query value wins over the number on the page, so the page number is
    // only parsed when the URL carries no "lv" at all.
    string? idText = null;
    if (query.AllKeys.Contains("lv"))
    {
        idText = query["lv"] ?? "0";
    }
    else if (number != null)
    {
        idText = number.InnerText;
    }

    int id = 0;
    if (idText != null && !int.TryParse(idText, NumberStyles.Integer, CultureInfo.InvariantCulture, out id))
    {
        System.Diagnostics.Trace.TraceWarning($"No lecture stored for '{source}': '{idText}' is not a valid lecture number.");
        return;
    }

    // Named differently from the static semester filter on purpose.
    string lectureSemester = string.Empty;
    if (when != null)
    {
        lectureSemester = when.InnerText;
    }
    else if (query.AllKeys.Contains("semester"))
    {
        lectureSemester = query["semester"] ?? "";
    }

    Lecture? lecture = db.Lectures
        .Include(x => x.Events)
        .FirstOrDefault(x => x.Id == id && x.Semester == lectureSemester);

    if (lecture == null)
    {
        lecture = new Lecture
        {
            Id = id,
            Semester = lectureSemester
        };

        db.Lectures.Add(lecture);
    }

    lecture.Url = source;

    // Fields whose node is missing on the page keep their previous value.
    if (branch != null)
    {
        lecture.Branch = branch.InnerText;
    }

    if (what != null)
    {
        lecture.Title = what.InnerText;
    }

    if (info != null)
    {
        lecture.Description = info.InnerHtml;
    }

    if (type != null)
    {
        lecture.Type = type.GetAttributeValue("title", "");
    }

    if (events != null)
    {
        CreateLectureEvents(events, lecture);
    }
}
|
|
|
|
/// <summary>
/// Replaces the events of a lecture with the ones listed on its course page.
/// </summary>
/// <param name="events">The list items of the page's event list.</param>
/// <param name="lecture">The lecture that owns the events; its <c>Semester</c> must start with a four-digit year.</param>
/// <exception cref="ArgumentOutOfRangeException"><c>Semester</c> is shorter than four characters.</exception>
/// <exception cref="FormatException">The year, a date or a time on the page cannot be parsed.</exception>
private static void CreateLectureEvents(HtmlNodeCollection events, Lecture lecture)
{
    int year = int.Parse(lecture.Semester[..4], CultureInfo.InvariantCulture);

    // NOTE(review): assumes winter terms are written as "<year>W" (for example
    // "2023W") - confirm against the scraped data. Any other form leaves the
    // year handling exactly as it was before.
    bool isWinterTerm = lecture.Semester.Length > 4
        && char.ToUpperInvariant(lecture.Semester[4]) == 'W';

    db.RemoveRange(lecture.Events);
    lecture.Events.Clear();

    foreach (var item in events)
    {
        LectureEvent lectureEvent = new() { Lecture = lecture };

        var day = item.SelectSingleNode("*[@class='date']");
        var time = item.SelectSingleNode("*[@class='time']");
        var room = item.SelectSingleNode("*[@class='room']");

        DateTime date = new();
        if (day != null)
        {
            date = ResolveEventDate(day.InnerText, year, isWinterTerm);
        }

        if (time != null)
        {
            var times = time.InnerText.Split(" - ");

            var from = TimeSpan.ParseExact(times[0], "hh\\:mm", CultureInfo.InvariantCulture);
            var to = TimeSpan.ParseExact(times[1], "hh\\:mm", CultureInfo.InvariantCulture);

            lectureEvent.From = date.Add(from);
            lectureEvent.To = date.Add(to);
        }

        if (room != null)
        {
            lectureEvent.Location = room.InnerText;
        }

        lecture.Events.Add(lectureEvent);
    }
}

// Turns a "dd.MM." text into a full date, picking the calendar year the day falls in.
private static DateTime ResolveEventDate(string dayText, int termYear, bool isWinterTerm)
{
    // Months up to this one belong to the year after the start of a winter term.
    const int lastMonthOfFollowingYear = 6;

    // Parse against a leap year so that "29.02." is accepted whatever termYear is.
    DateTime dayAndMonth = DateTime.ParseExact(dayText + "2000", "dd.MM.yyyy", CultureInfo.InvariantCulture);

    // A winter term starts in one calendar year and ends in the next; using the
    // start year for all events would put the January dates a year too early.
    int eventYear = isWinterTerm && dayAndMonth.Month <= lastMonthOfFollowingYear
        ? termYear + 1
        : termYear;

    return new DateTime(eventYear, dayAndMonth.Month, dayAndMonth.Day);
}
|
|
|
|
}
|
|
}
|