Performance update with HTMLAgilityPack

Added Branches
Added LinksToScrape
Added ScrapedLinks
This commit is contained in:
Robin Weichselbraun 2024-10-12 17:36:23 +02:00
parent 824df98750
commit 03ddeba846
18 changed files with 843 additions and 154 deletions

View File

@ -25,6 +25,7 @@ namespace UWLecturePlan.Controllers
model.LectureEvents = db.LectureEvents.Include(x=>x.Lecture)
.Where(x => x.From >= from)
.Where(x => x.Lecture.Semester == model.CurrentSemester)
.Where(x => x.Lecture.Branch == model.BranchFilter || model.BranchFilter == null)
.ToList();
if (model.LocationFilter != null)
@ -33,6 +34,8 @@ namespace UWLecturePlan.Controllers
.Where(x => x.Location.Contains(model.LocationFilter)).ToList();
}
model.Branches = db.LectureEvents.Select(x => x.Lecture.Branch).Distinct().OrderBy(x=>x).ToList();
return View(model);
}

View File

@ -7,8 +7,12 @@ namespace UWLecturePlan.Models
public string CurrentSemester { get; set; }
public string LocationFilter { get; set; }
public string? LocationFilter { get; set; }
public string? BranchFilter { get; set; }
public List<LectureEvent> LectureEvents { get; set; }
public List<string> Branches { get; set; }
}
}

View File

@ -1,4 +1,21 @@
@model LecturesViewModel
@using System.Text.RegularExpressions
@model LecturesViewModel
@{
// Produces the label shown in the branch drop-down: when the stored branch text
// contains a space-hyphen-space separator, only the part captured after the final
// separator matched by the pattern is returned; otherwise the text is returned unchanged.
string GetBranchName(string branch)
{
    // The leading greedy wildcard swallows everything up to the last separator it can
    // reach, so capture group 1 is the trailing segment.
    var parsed = Regex.Match(branch, @".* - (.*)");
    return parsed.Success ? parsed.Groups[1].Value : branch;
}
}
<style>
@ -30,9 +47,17 @@
<form method="get" >
Semester: <input type="text" asp-for="CurrentSemester" style="width:3.5em;text-align:center;" maxlength="5" />
Ort: <input type="text" asp-for="LocationFilter" />
Studiengang:
<select type="text" asp-for="BranchFilter" >
<option value="">Alle</option>
@foreach (var branch in Model.Branches.OrderBy(x => GetBranchName(x)))
{
<option value="@branch">
@GetBranchName(branch)
</option>
}
</select>
<input type="submit" value="Filter" />
</form>

View File

@ -12,11 +12,11 @@ namespace UWLib
/// <summary>
/// A lecture stored in the lecture database together with its scheduled events.
/// </summary>
public class Lecture
{
    /// <summary>Numeric identifier of the lecture.</summary>
    public int Id { get; set; }

    /// <summary>Semester the lecture belongs to.</summary>
    public string Semester { get; set; }

    /// <summary>Title of the lecture.</summary>
    public string Title { get; set; }

    /// <summary>URL of the page describing the lecture.</summary>
    public string Url { get; set; }

    /// <summary>Optional descriptive text for the lecture; may be absent.</summary>
    public string? Description { get; set; }

    /// <summary>Events (dates) belonging to this lecture; starts out as an empty list.</summary>
    public List<LectureEvent> Events { get; set; } = new();

    /// <summary>Branch the lecture is listed under.</summary>
    public string Branch { get; set; }
}
}

View File

@ -12,6 +12,10 @@ namespace UWLib
public DbSet<Lecture> Lectures { get; set; }
public DbSet<LectureEvent> LectureEvents { get; set; }
public DbSet<ScrapedLink> ScrapedLinks { get; set; }
public DbSet<LinkToScrape> LinksToScrape { get; set; }
public string DbPath { get; }
@ -19,7 +23,7 @@ namespace UWLib
public LectureContext(string path)
{
DbPath = path;
this.Database.Migrate();
Database.Migrate();
}
public LectureContext()
@ -29,7 +33,7 @@ namespace UWLib
DbPath = System.IO.Path.Join(path, "lecture.db");
DbPath = "lecture.db";
this.Database.Migrate();
Database.Migrate();
}
// The following configures EF to create a Sqlite database file in the

16
UWLib/LinkToScrape.cs Normal file
View File

@ -0,0 +1,16 @@
using System;
using System.Collections.Generic;
using System.ComponentModel.DataAnnotations;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace UWLib
{
    /// <summary>
    /// Database entity describing a URL that is queued for scraping.
    /// </summary>
    public class LinkToScrape
    {
        /// <summary>The queued URL; it is the entity's primary key.</summary>
        [Key]
        public string Url { get; set; }

        /// <summary>
        /// Integer sort value stored alongside the URL.
        /// NOTE(review): presumably a priority for ordering the queue — confirm against the readers of this table.
        /// </summary>
        public int Sort { get; set; }
    }
}

View File

@ -0,0 +1,108 @@
// <auto-generated />
using System;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Migrations;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using UWLib;
#nullable disable
namespace UWLib.Migrations
{
[DbContext(typeof(LectureContext))]
[Migration("20241012131142_ScrapedLinks")]
partial class ScrapedLinks
{
/// <inheritdoc />
protected override void BuildTargetModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder.HasAnnotation("ProductVersion", "8.0.10");
modelBuilder.Entity("UWLib.Lecture", b =>
{
b.Property<int>("Id")
.HasColumnType("INTEGER");
b.Property<string>("Semester")
.HasColumnType("TEXT");
b.Property<string>("Description")
.HasColumnType("TEXT");
b.Property<string>("Title")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Url")
.IsRequired()
.HasColumnType("TEXT");
b.HasKey("Id", "Semester");
b.ToTable("Lectures");
});
modelBuilder.Entity("UWLib.LectureEvent", b =>
{
b.Property<int>("Id")
.ValueGeneratedOnAdd()
.HasColumnType("INTEGER");
b.Property<DateTime>("From")
.HasColumnType("TEXT");
b.Property<int>("LectureId")
.HasColumnType("INTEGER");
b.Property<string>("LectureSemester")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Location")
.IsRequired()
.HasColumnType("TEXT");
b.Property<DateTime>("To")
.HasColumnType("TEXT");
b.HasKey("Id");
b.HasIndex("LectureId", "LectureSemester");
b.ToTable("LectureEvents");
});
modelBuilder.Entity("UWLib.ScrapedLink", b =>
{
b.Property<string>("Url")
.HasColumnType("TEXT");
b.Property<DateTime>("LastScrape")
.HasColumnType("TEXT");
b.HasKey("Url");
b.ToTable("ScrapedLinks");
});
modelBuilder.Entity("UWLib.LectureEvent", b =>
{
b.HasOne("UWLib.Lecture", "Lecture")
.WithMany("Events")
.HasForeignKey("LectureId", "LectureSemester")
.OnDelete(DeleteBehavior.Cascade)
.IsRequired();
b.Navigation("Lecture");
});
modelBuilder.Entity("UWLib.Lecture", b =>
{
b.Navigation("Events");
});
#pragma warning restore 612, 618
}
}
}

View File

@ -0,0 +1,34 @@
using System;
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace UWLib.Migrations
{
    /// <summary>
    /// Migration that introduces the <c>ScrapedLinks</c> table.
    /// </summary>
    public partial class ScrapedLinks : Migration
    {
        /// <summary>
        /// Creates <c>ScrapedLinks</c> with two non-nullable TEXT columns, <c>Url</c> and
        /// <c>LastScrape</c>, and makes <c>Url</c> the primary key.
        /// </summary>
        /// <param name="migrationBuilder">Builder that collects the schema operations.</param>
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            migrationBuilder.CreateTable(
                name: "ScrapedLinks",
                // The anonymous type's property names are kept as-is: they name the columns.
                columns: columnBuilder => new
                {
                    Url = columnBuilder.Column<string>(type: "TEXT", nullable: false),
                    LastScrape = columnBuilder.Column<DateTime>(type: "TEXT", nullable: false)
                },
                constraints: constraintBuilder =>
                {
                    constraintBuilder.PrimaryKey("PK_ScrapedLinks", row => row.Url);
                });
        }

        /// <summary>
        /// Reverts <see cref="Up"/> by dropping the <c>ScrapedLinks</c> table.
        /// </summary>
        /// <param name="migrationBuilder">Builder that collects the schema operations.</param>
        protected override void Down(MigrationBuilder migrationBuilder) =>
            migrationBuilder.DropTable(name: "ScrapedLinks");
    }
}

View File

@ -0,0 +1,112 @@
// <auto-generated />
using System;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Migrations;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using UWLib;
#nullable disable
namespace UWLib.Migrations
{
[DbContext(typeof(LectureContext))]
[Migration("20241012131329_Branch")]
partial class Branch
{
/// <inheritdoc />
protected override void BuildTargetModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder.HasAnnotation("ProductVersion", "8.0.10");
modelBuilder.Entity("UWLib.Lecture", b =>
{
b.Property<int>("Id")
.HasColumnType("INTEGER");
b.Property<string>("Semester")
.HasColumnType("TEXT");
b.Property<string>("Branch")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Description")
.HasColumnType("TEXT");
b.Property<string>("Title")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Url")
.IsRequired()
.HasColumnType("TEXT");
b.HasKey("Id", "Semester");
b.ToTable("Lectures");
});
modelBuilder.Entity("UWLib.LectureEvent", b =>
{
b.Property<int>("Id")
.ValueGeneratedOnAdd()
.HasColumnType("INTEGER");
b.Property<DateTime>("From")
.HasColumnType("TEXT");
b.Property<int>("LectureId")
.HasColumnType("INTEGER");
b.Property<string>("LectureSemester")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Location")
.IsRequired()
.HasColumnType("TEXT");
b.Property<DateTime>("To")
.HasColumnType("TEXT");
b.HasKey("Id");
b.HasIndex("LectureId", "LectureSemester");
b.ToTable("LectureEvents");
});
modelBuilder.Entity("UWLib.ScrapedLink", b =>
{
b.Property<string>("Url")
.HasColumnType("TEXT");
b.Property<DateTime>("LastScrape")
.HasColumnType("TEXT");
b.HasKey("Url");
b.ToTable("ScrapedLinks");
});
modelBuilder.Entity("UWLib.LectureEvent", b =>
{
b.HasOne("UWLib.Lecture", "Lecture")
.WithMany("Events")
.HasForeignKey("LectureId", "LectureSemester")
.OnDelete(DeleteBehavior.Cascade)
.IsRequired();
b.Navigation("Lecture");
});
modelBuilder.Entity("UWLib.Lecture", b =>
{
b.Navigation("Events");
});
#pragma warning restore 612, 618
}
}
}

View File

@ -0,0 +1,29 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace UWLib.Migrations
{
    /// <summary>
    /// Migration that adds the <c>Branch</c> column to the <c>Lectures</c> table.
    /// </summary>
    public partial class Branch : Migration
    {
        /// <summary>
        /// Adds the non-nullable TEXT column <c>Branch</c> to <c>Lectures</c>, using the
        /// empty string as its default value.
        /// </summary>
        /// <param name="migrationBuilder">Builder that collects the schema operations.</param>
        protected override void Up(MigrationBuilder migrationBuilder) =>
            migrationBuilder.AddColumn<string>(
                name: "Branch",
                table: "Lectures",
                type: "TEXT",
                nullable: false,
                defaultValue: "");

        /// <summary>
        /// Reverts <see cref="Up"/> by dropping the <c>Branch</c> column from <c>Lectures</c>.
        /// </summary>
        /// <param name="migrationBuilder">Builder that collects the schema operations.</param>
        protected override void Down(MigrationBuilder migrationBuilder) =>
            migrationBuilder.DropColumn(name: "Branch", table: "Lectures");
    }
}

View File

@ -0,0 +1,125 @@
// <auto-generated />
using System;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Migrations;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using UWLib;
#nullable disable
namespace UWLib.Migrations
{
[DbContext(typeof(LectureContext))]
[Migration("20241012140426_LinksToScrape")]
partial class LinksToScrape
{
/// <inheritdoc />
protected override void BuildTargetModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder.HasAnnotation("ProductVersion", "8.0.10");
modelBuilder.Entity("UWLib.Lecture", b =>
{
b.Property<int>("Id")
.HasColumnType("INTEGER");
b.Property<string>("Semester")
.HasColumnType("TEXT");
b.Property<string>("Branch")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Description")
.HasColumnType("TEXT");
b.Property<string>("Title")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Url")
.IsRequired()
.HasColumnType("TEXT");
b.HasKey("Id", "Semester");
b.ToTable("Lectures");
});
modelBuilder.Entity("UWLib.LectureEvent", b =>
{
b.Property<int>("Id")
.ValueGeneratedOnAdd()
.HasColumnType("INTEGER");
b.Property<DateTime>("From")
.HasColumnType("TEXT");
b.Property<int>("LectureId")
.HasColumnType("INTEGER");
b.Property<string>("LectureSemester")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Location")
.IsRequired()
.HasColumnType("TEXT");
b.Property<DateTime>("To")
.HasColumnType("TEXT");
b.HasKey("Id");
b.HasIndex("LectureId", "LectureSemester");
b.ToTable("LectureEvents");
});
modelBuilder.Entity("UWLib.LinkToScrape", b =>
{
b.Property<string>("Url")
.HasColumnType("TEXT");
b.Property<int>("Sort")
.HasColumnType("INTEGER");
b.HasKey("Url");
b.ToTable("LinksToScrape");
});
modelBuilder.Entity("UWLib.ScrapedLink", b =>
{
b.Property<string>("Url")
.HasColumnType("TEXT");
b.Property<DateTime>("LastScrape")
.HasColumnType("TEXT");
b.HasKey("Url");
b.ToTable("ScrapedLinks");
});
modelBuilder.Entity("UWLib.LectureEvent", b =>
{
b.HasOne("UWLib.Lecture", "Lecture")
.WithMany("Events")
.HasForeignKey("LectureId", "LectureSemester")
.OnDelete(DeleteBehavior.Cascade)
.IsRequired();
b.Navigation("Lecture");
});
modelBuilder.Entity("UWLib.Lecture", b =>
{
b.Navigation("Events");
});
#pragma warning restore 612, 618
}
}
}

View File

@ -0,0 +1,33 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace UWLib.Migrations
{
    /// <summary>
    /// Migration that introduces the <c>LinksToScrape</c> table.
    /// </summary>
    public partial class LinksToScrape : Migration
    {
        /// <summary>
        /// Creates <c>LinksToScrape</c> with a non-nullable TEXT column <c>Url</c> (the primary
        /// key) and a non-nullable INTEGER column <c>Sort</c>.
        /// </summary>
        /// <param name="migrationBuilder">Builder that collects the schema operations.</param>
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            migrationBuilder.CreateTable(
                name: "LinksToScrape",
                // The anonymous type's property names are kept as-is: they name the columns.
                columns: columnBuilder => new
                {
                    Url = columnBuilder.Column<string>(type: "TEXT", nullable: false),
                    Sort = columnBuilder.Column<int>(type: "INTEGER", nullable: false)
                },
                constraints: constraintBuilder =>
                {
                    constraintBuilder.PrimaryKey("PK_LinksToScrape", row => row.Url);
                });
        }

        /// <summary>
        /// Reverts <see cref="Up"/> by dropping the <c>LinksToScrape</c> table.
        /// </summary>
        /// <param name="migrationBuilder">Builder that collects the schema operations.</param>
        protected override void Down(MigrationBuilder migrationBuilder) =>
            migrationBuilder.DropTable(name: "LinksToScrape");
    }
}

View File

@ -25,6 +25,10 @@ namespace UWLib.Migrations
b.Property<string>("Semester")
.HasColumnType("TEXT");
b.Property<string>("Branch")
.IsRequired()
.HasColumnType("TEXT");
b.Property<string>("Description")
.HasColumnType("TEXT");
@ -71,6 +75,32 @@ namespace UWLib.Migrations
b.ToTable("LectureEvents");
});
modelBuilder.Entity("UWLib.LinkToScrape", b =>
{
b.Property<string>("Url")
.HasColumnType("TEXT");
b.Property<int>("Sort")
.HasColumnType("INTEGER");
b.HasKey("Url");
b.ToTable("LinksToScrape");
});
modelBuilder.Entity("UWLib.ScrapedLink", b =>
{
b.Property<string>("Url")
.HasColumnType("TEXT");
b.Property<DateTime>("LastScrape")
.HasColumnType("TEXT");
b.HasKey("Url");
b.ToTable("ScrapedLinks");
});
modelBuilder.Entity("UWLib.LectureEvent", b =>
{
b.HasOne("UWLib.Lecture", "Lecture")

17
UWLib/ScrapedLink.cs Normal file
View File

@ -0,0 +1,17 @@
using System;
using System.Collections.Generic;
using System.ComponentModel.DataAnnotations;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace UWLib
{
    /// <summary>
    /// Database entity recording when a URL was last scraped.
    /// </summary>
    public class ScrapedLink
    {
        /// <summary>The scraped URL; it is the entity's primary key.</summary>
        [Key]
        public string Url { get; set; }

        /// <summary>Timestamp of the most recent scrape of <see cref="Url"/>.</summary>
        public DateTime LastScrape { get; set; }
    }
}

View File

@ -1,11 +1,15 @@

using System.Collections.Generic;
using System.Globalization;
using System.Net.NetworkInformation;
using System.Text.Encodings.Web;
using System.Web;
using System.Xml.Linq;
using HtmlAgilityPack;
using OpenQA.Selenium;
using OpenQA.Selenium.BiDi.Modules.Script;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.DevTools.V127.Target;
using OpenQA.Selenium.Support.UI;
using UWLib;
using static Microsoft.EntityFrameworkCore.DbLoggerCategory;
@ -13,110 +17,287 @@ using static Microsoft.EntityFrameworkCore.DbLoggerCategory;
internal class Program
{
readonly static List<string> checkedUrls = [];
readonly static List<string> urlsToCheck = [];
static List<string> checkedUrls = new List<string>();
static List<string> urlsToCheck = new List<string>();
static bool refreshLectures = false;
static LectureContext db = new LectureContext();
static IWebDriver driver = null;
static readonly LectureContext db = new();
static ChromeDriver? driver = null;
private static void Main(string[] args)
{
AppDomain.CurrentDomain.ProcessExit += (s, e) =>
{
Done();
};
var service = ChromeDriverService.CreateDefaultService();
service.HideCommandPromptWindow = true;
var options = new ChromeOptions();
//options.AddArgument("--headless");
driver = new ChromeDriver(options);
driver = new ChromeDriver(service, options);
checkedUrls.AddRange(db.Lectures.Select(x => x.Url).ToList());
if (args.Any(x => x == "-r" || x == "--refresh-lectures"))
{
refreshLectures = true;
Console.WriteLine("Refreshing lectures");
}
if (args.Length == 0)
{
urlsToCheck.AddRange([.. db.LinksToScrape.Select(x => x.Url)]);
//add all urls that were checked in the last 24 hours
checkedUrls.AddRange([.. db.ScrapedLinks.Where(x => x.LastScrape > DateTime.Now.AddDays(-1)).Select(x => x.Url)]);
//remove all checked urls from the urls to check
foreach (var url in checkedUrls)
{
urlsToCheck.Remove(url);
}
if (urlsToCheck.Count == 0)
{
// add the first url to check / Vorlesungsverzeichnis
urlsToCheck.Add("https://ufind.univie.ac.at/de/vvz.html");
}
}
if (refreshLectures)
{
urlsToCheck.AddRange([.. db.Lectures.Select(x => x.Url)]);
}
Console.CursorVisible = false;
int top = Console.CursorTop;
while (urlsToCheck.Count > 0)
{
var url = urlsToCheck.First();
try
{
FindUrls(url);
}
catch (Exception e)
{
Console.Clear();
Console.WriteLine($"Url: {url}");
Console.WriteLine();
Console.WriteLine(e);
return;
}
Console.CursorLeft = 0;
Console.CursorTop = top;
Console.WriteLine($"Urls checked: {checkedUrls.Count}");
Console.WriteLine($"Urls to check: {urlsToCheck.Count}");
}
Console.WriteLine("Done");
}
/// <summary>
/// Shutdown routine: quits the browser driver if one was created, makes the console
/// cursor visible again and then blocks until a line is read from standard input.
/// </summary>
private static void Done()
{
    if (driver is { } browser)
    {
        browser.Quit();
    }

    Console.CursorVisible = true;

    // Waits for input last, after the browser has been closed and the cursor restored.
    Console.ReadLine();
}
/// <summary>
/// Resolves a (possibly relative) link against the URL of the page it was found on.
/// </summary>
/// <param name="relativeUrl">The link as written in the page, e.g. an <c>href</c> value.</param>
/// <param name="baseUrl">Absolute URL of the page that contains the link.</param>
/// <returns>The absolute URL, with HTML entities such as <c>&amp;amp;</c> decoded.</returns>
static string GetAbsoluteUrl(string relativeUrl, string baseUrl)
{
    Uri resolved = new(new Uri(baseUrl), relativeUrl);

    // Entity decoding is applied to the resolved absolute URI string, not to the raw input.
    return HttpUtility.HtmlDecode(resolved.AbsoluteUri);
}
private static void FindUrls(string source)
{
if (checkedUrls.Contains(source))
{
urlsToCheck.Remove(source);
return;
}
driver?.Navigate().GoToUrl(source);
if (db.Lectures.Any(x=>x.Url == source))
{
checkedUrls.Add(source);
urlsToCheck.Remove(source);
return;
}
driver.Navigate().GoToUrl(source);
WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromSeconds(2));
// wait for the page to load
WebDriverWait wait = new(driver, TimeSpan.FromSeconds(2));
wait.Until(d => d.FindElements(By.XPath("//a[starts-with(@href, 'vvz_sub.html')]")).Count > 0 ||
d.FindElements(By.XPath("//a[starts-with(@href, 'course.html')]")).Count > 0 ||
d.FindElements(By.XPath("//h1/*/*[@class='what']")).Count > 0
);
HtmlDocument doc = new();
doc.LoadHtml(driver?.PageSource);
var root = doc.DocumentNode;
if (source.Contains("course.html"))
{
Lecture lecture = new Lecture();
CreateLecture(source, root);
}
lecture.Url = source;
if (!refreshLectures)
{
FindPathLinks(source, root);
FindCourseLinks(source, root);
}
RefreshScrapedLink(source);
RemoveLinkToScrape(source);
db.SaveChanges();
checkedUrls.Add(source);
urlsToCheck.Remove(source);
}
/// <summary>
/// Stamps <paramref name="source"/> as scraped right now, creating its
/// <see cref="ScrapedLink"/> row when none exists yet.
/// </summary>
/// <param name="source">URL that has just been scraped.</param>
/// <remarks>Only changes the context; this method does not call <c>SaveChanges</c> itself.</remarks>
private static void RefreshScrapedLink(string source)
{
    // Reuse the existing row when there is one, otherwise register a fresh entity.
    var entry = db.ScrapedLinks.Find(source)
        ?? db.ScrapedLinks.Add(new ScrapedLink { Url = source }).Entity;

    entry.LastScrape = DateTime.Now;
}
/// <summary>
/// Collects the course links on a page and queues those that still need scraping.
/// </summary>
/// <param name="source">URL of the page <paramref name="root"/> was loaded from; relative links are resolved against it.</param>
/// <param name="root">Root node of the parsed page.</param>
/// <remarks>
/// Only links whose parent element also has an <c>abbr</c> child with a title containing
/// "Vorlesung" are considered. Each new URL is stored with sort value 0 and inserted at the
/// front of <c>urlsToCheck</c>, so it is visited before the remaining queue.
/// This method does not call <c>SaveChanges</c> itself.
/// </remarks>
private static void FindCourseLinks(string source, HtmlNode root)
{
    var courseLinkParents = root.SelectNodes("//a[starts-with(@href, 'course.html')]/..");
    if (courseLinkParents == null)
    {
        // Nothing matched on this page.
        return;
    }

    foreach (var parent in courseLinkParents)
    {
        if (parent.SelectSingleNode("abbr[contains(@title,'Vorlesung')]") == null)
        {
            continue;
        }

        var link = parent.SelectSingleNode("a[contains(@href, 'course.html')]");
        if (link == null)
        {
            continue;
        }

        var url = GetAbsoluteUrl(link.GetAttributeValue("href", ""), source);
        if (!checkedUrls.Contains(url))
        {
            // Url is the primary key of LinksToScrape. Going through AddLinkToScrape
            // (as FindPathLinks does) skips URLs that are already stored or already
            // added in this unit of work, instead of failing with a duplicate key.
            AddLinkToScrape(url, 0);
            urlsToCheck.Insert(0, url);
        }
    }
}
/// <summary>
/// Collects the <c>vvz_sub.html</c> links on a page and queues those that still need scraping.
/// </summary>
/// <param name="source">URL of the page <paramref name="root"/> was loaded from; relative links are resolved against it.</param>
/// <param name="root">Root node of the parsed page.</param>
/// <remarks>
/// Each new URL is stored with sort value 1 and appended to the end of <c>urlsToCheck</c>.
/// This method does not call <c>SaveChanges</c> itself.
/// </remarks>
private static void FindPathLinks(string source, HtmlNode root)
{
    const int pathSort = 1;

    var anchors = root.SelectNodes("//a[starts-with(@href, 'vvz_sub.html')]");
    if (anchors == null)
    {
        // Nothing matched on this page.
        return;
    }

    foreach (var anchor in anchors)
    {
        var target = GetAbsoluteUrl(anchor.GetAttributeValue("href", ""), source);
        if (checkedUrls.Contains(target))
        {
            continue;
        }

        AddLinkToScrape(target, pathSort);
        urlsToCheck.Add(target);
    }
}
/// <summary>
/// Marks the <see cref="LinkToScrape"/> row for <paramref name="url"/> for deletion, if such a row exists.
/// </summary>
/// <param name="url">URL (primary key) of the queue entry to remove.</param>
/// <remarks>Only changes the context; this method does not call <c>SaveChanges</c> itself.</remarks>
private static void RemoveLinkToScrape(string url)
{
    if (db.LinksToScrape.Find(url) is { } queued)
    {
        db.LinksToScrape.Remove(queued);
    }
}
/// <summary>
/// Adds a <see cref="LinkToScrape"/> row for <paramref name="url"/> unless one is already present.
/// </summary>
/// <param name="url">URL (primary key) to queue.</param>
/// <param name="sort">Sort value stored with a newly created row; an existing row is left untouched.</param>
/// <remarks>Only changes the context; this method does not call <c>SaveChanges</c> itself.</remarks>
private static void AddLinkToScrape(string url, int sort)
{
    if (db.LinksToScrape.Find(url) is not null)
    {
        return;
    }

    db.LinksToScrape.Add(new LinkToScrape { Url = url, Sort = sort });
}
private static void CreateLecture(string source, HtmlNode root)
{
var branch = root.SelectSingleNode("/html/body/main/div[1]/div[1]/a");
var what = root.SelectSingleNode("//h1/*/*[@class='what']");
var when = root.SelectSingleNode("//h1/*/*[@class='when']");
var info = root.SelectSingleNode("//*[@class='info list']");
var events = root.SelectNodes("//ul[@class='classes events list']/li");
var uri = new Uri(source);
var query = HttpUtility.ParseQueryString(uri.Query);
Lecture? lecture = db.Lectures.FirstOrDefault(db => db.Url == source);
if (lecture == null)
{
lecture = new Lecture();
if (query.AllKeys.Contains("lv"))
{
lecture.Id = int.Parse(query["lv"]);
lecture.Id = int.Parse(query["lv"] ?? "0");
}
int year = DateTime.Now.Year;
var what = driver.FindElements(By.XPath("//h1/*/*[@class='what']"));
var when = driver.FindElements(By.XPath("//h1/*/*[@class='when']"));
var info = driver.FindElements(By.XPath("//*[@class='info list']"));
var events = driver.FindElements(By.XPath("//ul[@class='classes events list']/li"));
if (what.Count > 0)
if (when != null)
{
lecture.Title = what.First().Text;
lecture.Semester = when.InnerText;
}
lecture.Url = source;
if (when.Count > 0)
db.Lectures.Add(lecture);
}
if (branch != null)
{
lecture.Semester = when.First().Text;
year = int.Parse(when.First().Text.Substring(0, 4));
lecture.Branch = branch.InnerText;
}
if (info.Count > 0)
if (what != null)
{
lecture.Description = info.First().GetAttribute("innerHTML");
lecture.Title = what.InnerText;
}
if (info != null)
{
lecture.Description = info.InnerHtml;
}
if (events != null)
{
CreateLectureEvents(events, lecture);
}
}
private static void CreateLectureEvents(HtmlNodeCollection events, Lecture lecture)
{
int year = int.Parse(lecture.Semester[..4]);
db.RemoveRange(lecture.Events);
lecture.Events.Clear();
foreach (var item in events)
{
LectureEvent lectureEvent = new LectureEvent();
LectureEvent lectureEvent = new();
var day = item.FindElements(By.XPath("*[@class='date']"));
var time = item.FindElements(By.XPath("*[@class='time']"));
var room = item.FindElements(By.XPath("*[@class='room']"));
DateTime date = new DateTime();
if (day.Count > 0)
var day = item.SelectSingleNode("*[@class='date']");
var time = item.SelectSingleNode("*[@class='time']");
var room = item.SelectSingleNode("*[@class='room']");
DateTime date = new();
if (day != null)
{
date = DateTime.ParseExact(day.First().Text + year.ToString(), "dd.MM.yyyy", CultureInfo.InvariantCulture);
date = DateTime.ParseExact(day.InnerText + year.ToString(), "dd.MM.yyyy", CultureInfo.InvariantCulture);
}
if (time.Count > 0)
if (time != null)
{
var text = time.First().Text;
var text = time.InnerText;
var times = text.Split(" - ");
@ -127,56 +308,12 @@ internal class Program
lectureEvent.To = date.Add(to);
}
if (room.Count > 0)
if (room != null)
{
lectureEvent.Location = room.First().Text;
lectureEvent.Location = room.InnerText;
}
lecture.Events.Add(lectureEvent);
}
try
{
db.Lectures.Add(lecture);
}
catch (Exception e)
{
}
db.SaveChanges();
}
var pathLinks = driver.FindElements(By.XPath("//a[starts-with(@href, 'vvz_sub.html')]"));
foreach (var link in pathLinks)
{
var url = link.GetAttribute("href");
if (!checkedUrls.Contains(url))
{
urlsToCheck.Add(link.GetAttribute("href"));
}
}
var courseLinkParents = driver.FindElements(By.XPath("//a[starts-with(@href, 'course.html')]/.."));
foreach (var parent in courseLinkParents)
{
if (parent.FindElements(By.XPath("abbr[@title='Vorlesung']")).Count > 0)
{
var link = parent.FindElement(By.XPath("a[starts-with(@href, 'course.html')]"));
var url = link.GetAttribute("href");
if (!checkedUrls.Contains(url))
{
urlsToCheck.Insert(0, link.GetAttribute("href"));
}
}
}
// driver.Close();
checkedUrls.Add(source);
urlsToCheck.Remove(source);
}
}

View File

@ -0,0 +1,8 @@
{
"profiles": {
"UWScraper": {
"commandName": "Project",
"commandLineArgs": "-r"
}
}
}

View File

@ -9,6 +9,10 @@
<ItemGroup>
<PackageReference Include="HtmlAgilityPack" Version="1.11.67" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="8.0.10">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="Selenium.WebDriver" Version="4.25.0" />
</ItemGroup>

BIN
UWScraper/lecture.db Normal file

Binary file not shown.