Lectureplan/UWScraper/Program.cs
Robin Weichselbraun 824df98750 Add project files.
2024-10-12 14:04:23 +02:00

182 lines
5.3 KiB
C#

using System.Globalization;
using System.Net.NetworkInformation;
using System.Web;
using System.Xml.Linq;
using OpenQA.Selenium;
using OpenQA.Selenium.BiDi.Modules.Script;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.Support.UI;
using UWLib;
using static Microsoft.EntityFrameworkCore.DbLoggerCategory;
/// <summary>
/// Crawls the course directory at ufind.univie.ac.at with a Chrome WebDriver and
/// stores every visited course page (title, semester, description and events)
/// in the <c>LectureContext</c> database.
/// </summary>
internal class Program
{
    private const string StartUrl = "https://ufind.univie.ac.at/de/vvz.html";
    private const string DirectoryLinkXPath = "//a[starts-with(@href, 'vvz_sub.html')]";
    private const string CourseLinkXPath = "//a[starts-with(@href, 'course.html')]";
    private const string TitleXPath = "//h1/*/*[@class='what']";

    // URLs that were already processed (or are already stored in the database).
    static HashSet<string> checkedUrls = new HashSet<string>();

    // Ordered work queue; course pages are inserted at the front so they are visited first.
    static List<string> urlsToCheck = new List<string>();

    // Mirrors the content of urlsToCheck for O(1) "already queued?" checks.
    static HashSet<string> queuedUrls = new HashSet<string>();

    static LectureContext db = new LectureContext();
    static IWebDriver driver = null;

    /// <summary>
    /// Entry point: starts Chrome, seeds the queue with the directory root and
    /// processes URLs until the queue is empty.
    /// </summary>
    /// <param name="args">Command line arguments (unused).</param>
    private static void Main(string[] args)
    {
        var options = new ChromeOptions();
        // To run without a visible browser window: options.AddArgument("--headless");
        driver = new ChromeDriver(options);
        try
        {
            // Lectures stored by an earlier run do not need to be visited again.
            checkedUrls.UnionWith(db.Lectures.Select(x => x.Url).ToList());
            Enqueue(StartUrl, prioritize: false);
            while (urlsToCheck.Count > 0)
            {
                FindUrls(urlsToCheck[0]);
            }
        }
        finally
        {
            // Quit() ends the browser session so no Chrome process is left behind,
            // even when the crawl aborts with an exception.
            driver.Quit();
            db.Dispose();
        }
    }

    /// <summary>
    /// Visits <paramref name="source"/>, stores it as a lecture when it is a course
    /// page, queues all directory and lecture links found on the page and finally
    /// marks the URL as checked.
    /// </summary>
    /// <param name="source">Absolute URL of the page to process.</param>
    private static void FindUrls(string source)
    {
        if (checkedUrls.Contains(source) || db.Lectures.Any(x => x.Url == source))
        {
            MarkChecked(source);
            return;
        }

        driver.Navigate().GoToUrl(source);
        try
        {
            var wait = new WebDriverWait(driver, TimeSpan.FromSeconds(2));
            wait.Until(d => d.FindElements(By.XPath(DirectoryLinkXPath)).Count > 0 ||
                            d.FindElements(By.XPath(CourseLinkXPath)).Count > 0 ||
                            d.FindElements(By.XPath(TitleXPath)).Count > 0);
        }
        catch (WebDriverTimeoutException)
        {
            // The page shows none of the expected elements; skip it instead of
            // aborting the whole crawl.
            Console.Error.WriteLine($"Timed out waiting for content on {source}; skipping.");
            MarkChecked(source);
            return;
        }

        if (source.Contains("course.html"))
        {
            ScrapeLecture(source);
        }

        CollectLinks();
        MarkChecked(source);
    }

    /// <summary>
    /// Reads the lecture data from the currently loaded course page and saves it.
    /// </summary>
    /// <param name="source">URL of the course page currently loaded in the driver.</param>
    private static void ScrapeLecture(string source)
    {
        var lecture = new Lecture();
        lecture.Url = source;

        // The "lv" query parameter, when present and numeric, is used as the lecture id.
        var query = HttpUtility.ParseQueryString(new Uri(source).Query);
        if (int.TryParse(query["lv"], out var id))
        {
            lecture.Id = id;
        }

        // Event dates on the page carry no year; fall back to the current year
        // when the semester heading does not start with a four-digit number.
        int year = DateTime.Now.Year;

        var what = driver.FindElements(By.XPath(TitleXPath));
        var when = driver.FindElements(By.XPath("//h1/*/*[@class='when']"));
        var info = driver.FindElements(By.XPath("//*[@class='info list']"));
        var events = driver.FindElements(By.XPath("//ul[@class='classes events list']/li"));

        if (what.Count > 0)
        {
            lecture.Title = what[0].Text;
        }
        if (when.Count > 0)
        {
            var semester = when[0].Text;
            lecture.Semester = semester;
            if (semester.Length >= 4 && int.TryParse(semester.Substring(0, 4), out var semesterYear))
            {
                year = semesterYear;
            }
        }
        if (info.Count > 0)
        {
            lecture.Description = info[0].GetAttribute("innerHTML");
        }

        foreach (var item in events)
        {
            if (TryParseEvent(item, year, out var lectureEvent))
            {
                lecture.Events.Add(lectureEvent);
            }
        }

        db.Lectures.Add(lecture);
        try
        {
            db.SaveChanges();
        }
        catch (Microsoft.EntityFrameworkCore.DbUpdateException e)
        {
            Console.Error.WriteLine($"Could not save lecture {source}: {e.InnerException?.Message ?? e.Message}");
            // Drop the failed entities; otherwise every later SaveChanges()
            // would try to insert them again and fail the same way.
            db.ChangeTracker.Clear();
        }
    }

    /// <summary>
    /// Builds a <c>LectureEvent</c> from one list item of the events list.
    /// </summary>
    /// <param name="item">The list item holding the date, time and room elements.</param>
    /// <param name="year">Year appended to the date text before parsing.</param>
    /// <param name="lectureEvent">The parsed event.</param>
    /// <returns>
    /// <c>false</c> when a date or time element is present but its text does not
    /// match the expected format; <c>true</c> otherwise.
    /// </returns>
    private static bool TryParseEvent(IWebElement item, int year, out LectureEvent lectureEvent)
    {
        lectureEvent = new LectureEvent();
        var day = item.FindElements(By.XPath("*[@class='date']"));
        var time = item.FindElements(By.XPath("*[@class='time']"));
        var room = item.FindElements(By.XPath("*[@class='room']"));

        DateTime date = new DateTime();
        if (day.Count > 0)
        {
            var dateText = day[0].Text + year.ToString(CultureInfo.InvariantCulture);
            if (!DateTime.TryParseExact(dateText, "dd.MM.yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
            {
                Console.Error.WriteLine($"Skipping event with unexpected date '{day[0].Text}'.");
                return false;
            }
        }

        if (time.Count > 0)
        {
            var text = time[0].Text;
            var times = text.Split(" - ");
            if (times.Length < 2 ||
                !TimeSpan.TryParseExact(times[0], "hh\\:mm", CultureInfo.InvariantCulture, out var start) ||
                !TimeSpan.TryParseExact(times[1], "hh\\:mm", CultureInfo.InvariantCulture, out var end))
            {
                Console.Error.WriteLine($"Skipping event with unexpected time '{text}'.");
                return false;
            }
            lectureEvent.From = date.Add(start);
            lectureEvent.To = date.Add(end);
        }

        if (room.Count > 0)
        {
            lectureEvent.Location = room[0].Text;
        }
        return true;
    }

    /// <summary>
    /// Queues the links of the currently loaded page: directory links go to the
    /// end of the queue, course links whose parent also contains an
    /// <c>abbr</c> titled "Vorlesung" go to the front.
    /// </summary>
    private static void CollectLinks()
    {
        foreach (var link in driver.FindElements(By.XPath(DirectoryLinkXPath)))
        {
            Enqueue(link.GetAttribute("href"), prioritize: false);
        }

        var courseLinkParents = driver.FindElements(By.XPath(CourseLinkXPath + "/.."));
        foreach (var parent in courseLinkParents)
        {
            if (parent.FindElements(By.XPath("abbr[@title='Vorlesung']")).Count > 0)
            {
                var link = parent.FindElement(By.XPath("a[starts-with(@href, 'course.html')]"));
                Enqueue(link.GetAttribute("href"), prioritize: true);
            }
        }
    }

    /// <summary>
    /// Adds <paramref name="url"/> to the work queue unless it is empty, already
    /// checked or already queued.
    /// </summary>
    /// <param name="url">URL to queue; may be null when the link had no href.</param>
    /// <param name="prioritize">When true the URL is placed at the front of the queue.</param>
    private static void Enqueue(string url, bool prioritize)
    {
        if (string.IsNullOrEmpty(url) || checkedUrls.Contains(url) || !queuedUrls.Add(url))
        {
            return;
        }

        if (prioritize)
        {
            urlsToCheck.Insert(0, url);
        }
        else
        {
            urlsToCheck.Add(url);
        }
    }

    /// <summary>
    /// Records <paramref name="source"/> as processed and removes it from the queue.
    /// </summary>
    /// <param name="source">URL that has been handled.</param>
    private static void MarkChecked(string source)
    {
        checkedUrls.Add(source);
        urlsToCheck.Remove(source);
        queuedUrls.Remove(source);
    }
}