Skip to content

Commit

Permalink
improved title detection
Browse files Browse the repository at this point in the history
  • Loading branch information
merlinschumacher committed Jun 3, 2024
1 parent 1cd976f commit 27b280a
Show file tree
Hide file tree
Showing 17 changed files with 465 additions and 63 deletions.
7 changes: 5 additions & 2 deletions Data/KinohannoverContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ namespace kinohannover.Data
public class KinohannoverContext(DbContextOptions<KinohannoverContext> options) : DbContext(options)
{
public DbSet<Models.Movie> Movies { get; set; } = default!;
public DbSet<Models.Alias> Aliases { get; set; } = default!;

public DbSet<Models.Cinema> Cinema { get; set; } = default!;

Expand All @@ -14,8 +15,10 @@ protected override void OnModelCreating(ModelBuilder modelBuilder)
{
modelBuilder.UseCollation("NOCASE");

modelBuilder.Entity<Models.Movie>(m => m.Property(x => x.DisplayName).UseCollation("NOCASE"));
modelBuilder.Entity<Models.Alias>(m => m.Property(x => x.Value).UseCollation("NOCASE"));
modelBuilder.Entity<Models.Movie>(m => m.Property(n => n.DisplayName).UseCollation("NOCASE"));
modelBuilder.Entity<Models.Movie>(m => m.Navigation(n => n.Aliases).AutoInclude());
modelBuilder.Entity<Models.Movie>(m => m.HasMany(n => n.Aliases).WithOne(a => a.Movie).OnDelete(DeleteBehavior.Cascade));
modelBuilder.Entity<Models.Alias>(m => m.Property(n => n.Value).UseCollation("NOCASE"));

base.OnModelCreating(modelBuilder);
}
Expand Down
111 changes: 70 additions & 41 deletions Helpers/MovieTitleHelper.cs
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
using System.Globalization;
using System.Text.RegularExpressions;
using TMDbLib.Objects.Movies;

namespace kinohannover.Helpers
{
internal static class MovieTitleHelper
internal static partial class MovieTitleHelper
{
private static readonly char[] _dashCharacters = ['-', '֊', '־', '᐀', '᠆', '‐', '‑', '‒', '–', '—', '―', '⸗', '⸚', '⸺', '⸻', '⹀', '⹝', '〜', '〰', '゠', '︱', '︲', '﹘', '﹣', '-'];
private static readonly char[] _delimiterCharacters = [':', '(', ')', '[', ']', '{', '}', '<', '>', '|', '/', '\\', '!', '?', '.', ',', ';', ' ', '\t', '\n', '\r'];

Check warning on line 9 in Helpers/MovieTitleHelper.cs

View workflow job for this annotation

GitHub Actions / build

Remove the unused private field '_delimiterCharacters'. (https://rules.sonarsource.com/csharp/RSPEC-1144)
private const string _translationConst = "Translation";

public static string DetermineMovieTitle(string title, TMDbLib.Objects.Movies.Movie tmdbMovieDetails, bool guessHarder = true)
{
title = NormalizeTitle(title);
var matchedTitle = GetTitleFromTmdbData(title, tmdbMovieDetails);
if (matchedTitle is not null)
{
Expand Down Expand Up @@ -43,54 +42,52 @@ public static string DetermineMovieTitle(string title, TMDbLib.Objects.Movies.Mo

private static string? GetTitleFromTmdbData(string title, Movie tmdbMovieDetails)
{
if (tmdbMovieDetails.Title.Equals(title, StringComparison.CurrentCultureIgnoreCase))
if (tmdbMovieDetails.OriginalLanguage.Equals("DE", StringComparison.OrdinalIgnoreCase))
{
return tmdbMovieDetails.Title;
return tmdbMovieDetails.OriginalTitle.NormalizeDashes();
}

if (tmdbMovieDetails.OriginalTitle.Equals(title, StringComparison.CurrentCultureIgnoreCase))
var tmdbTitle = GetAlternativeTitle(tmdbMovieDetails, "DE");
if (tmdbTitle is not null)
{
return tmdbMovieDetails.OriginalTitle;
return tmdbTitle;
}

if (tmdbMovieDetails.OriginalLanguage.Equals("DE", StringComparison.OrdinalIgnoreCase))
tmdbTitle = tmdbMovieDetails.Title.NormalizeDashes();
if (tmdbTitle.Equals(title, StringComparison.CurrentCultureIgnoreCase))
{
return tmdbMovieDetails.OriginalTitle;
return tmdbMovieDetails.Title.NormalizeDashes();
}

var matchingAltTitle = tmdbMovieDetails.AlternativeTitles.Titles.Find(e => e.Title.Equals(title, StringComparison.CurrentCultureIgnoreCase))?.Title;
if (matchingAltTitle is not null)
tmdbTitle = tmdbMovieDetails.OriginalTitle.NormalizeDashes();
if (tmdbTitle.Equals(title, StringComparison.CurrentCultureIgnoreCase))
{
return matchingAltTitle;
return tmdbMovieDetails.OriginalTitle;
}
matchingAltTitle = GetAlternativeTitle(tmdbMovieDetails, "DE");
if (matchingAltTitle is not null)

var altTitle = tmdbMovieDetails.AlternativeTitles.Titles.Find(e => e.Title.NormalizeDashes().Equals(title, StringComparison.CurrentCultureIgnoreCase))?.Title;
if (altTitle is not null)
{
return matchingAltTitle;
return altTitle;
}
matchingAltTitle = GetAlternativeTitle(tmdbMovieDetails, "EN");
return matchingAltTitle is not null ? matchingAltTitle : null;
altTitle = GetAlternativeTitle(tmdbMovieDetails, "EN");
return altTitle is not null ? altTitle : null;
}

public static string NormalizeTitle(string title)
{
title = title.Normalize().Trim();
title = title.Normalize().NormalizeDashes();

foreach (var dash in _dashCharacters)
{
title = title.Trim(dash);
}
title = ReplaceMultipleSpacesRegex().Replace(title, " ");

foreach (var delim in _delimiterCharacters)
{
title = title.Trim(delim);
}
title = ReplaceParenthesisAttributeRegex().Replace(title, " ");

title = title.Replace("OmU", "", StringComparison.CurrentCultureIgnoreCase).Trim();
title = title.Replace("OV", "", StringComparison.CurrentCultureIgnoreCase).Trim();

// Avoid adding movies with only uppercase letters, as this is usually a sign of a bad title. Make them title case instead.
var upperCasePercentage = title.Count(c => char.IsLetter(c) && char.IsUpper(c)) / (double)title.Length;
#pragma warning disable CA1862 // We explicitly want to check for upper here.
var upperCaseWords = title.Split(' ').Count(e => e.ToUpper(CultureInfo.CurrentCulture) == e);
#pragma warning restore CA1862 // Use the 'StringComparison' method overloads to perform case-insensitive string comparisons
var words = title.Split(' ');
var upperCaseWords = words.Count(e => !LatinNumeralRegex().IsMatch(e) && e.Where(e => char.IsLetter(e)).All(char.IsUpper));
if (upperCasePercentage > 0.7)
{
return ToTitleCase(title);
Expand All @@ -100,7 +97,7 @@ public static string NormalizeTitle(string title)
return ToTitleCase(title);
}

return title;
return title.Trim();
}

private static string ToTitleCase(string title)
Expand All @@ -111,16 +108,7 @@ private static string ToTitleCase(string title)

public static string? GetMostSimilarTitle(IEnumerable<string> haystack, string needle)
{
Fastenshtein.Levenshtein lev = new(needle);
var needleLength = needle.Length;

var mostSimilarList = haystack.Select(e =>
{
var dist = lev.DistanceFrom(e);
var bigger = Math.Max(needleLength, e.Length);
var distPercent = (double)(bigger - dist) / bigger;
return (altTitle: e, index: distPercent);
});
var mostSimilarList = haystack.Select(e => (altTitle: e, index: needle.DistancePercentageFrom(e)));
return mostSimilarList.FirstOrDefault(e => e.index > 0.7).altTitle;
}

Expand All @@ -136,5 +124,46 @@ private static string ToTitleCase(string title)
}
return null;
}

/// <summary>
/// Matches a run of two or more consecutive whitespace characters.
/// NormalizeTitle replaces each match with a single space.
/// </summary>
[GeneratedRegex(@"\s{2,}")]
private static partial Regex ReplaceMultipleSpacesRegex();

/// <summary>
/// Matches a string that consists solely of an upper-case Roman numeral
/// (thousands, hundreds, tens and units groups, in that order).
/// NormalizeTitle uses it to keep numerals such as "II" or "XIV" out of the
/// count of all-upper-case words.
/// </summary>
// Every group in the pattern is optional, so the empty string matches as well.
[GeneratedRegex(@"^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")]
private static partial Regex LatinNumeralRegex();

/// <summary>
/// Matches a parenthesised section of a title, including the parentheses.
/// NormalizeTitle replaces each match with a single space.
/// </summary>
// NOTE(review): ".*" is greedy, so on a line with several parenthesised groups
// one match spans from the first "(" to the last ")" and also swallows the text
// in between — confirm this is intended, otherwise consider "\([^)]*\)".
[GeneratedRegex(@"\(.*\)")]
private static partial Regex ReplaceParenthesisAttributeRegex();
}

/// <summary>
/// String helpers used when comparing movie titles: dash normalization and a
/// Levenshtein-based similarity score.
/// </summary>
public static class StringExtensions
{
    // The single dash every character in _dashCharacters is rewritten to (en dash).
    private const char _normalizedDash = '–';

    // Dash-like characters that are treated as equivalent when comparing titles.
    private static readonly char[] _dashCharacters = ['-', '֊', '־', '᐀', '᠆', '‐', '‑', '‒', '–', '—', '―', '⸗', '⸚', '⸺', '⸻', '⹀', '⹝', '〜', '〰', '゠', '︱', '︲', '﹘', '﹣', '-'];

    /// <summary>
    /// Rewrites every dash-like character in <paramref name="s"/> to an en dash and
    /// removes all dashes from the start and the end of the string.
    /// </summary>
    /// <param name="s">The text to normalize.</param>
    /// <returns>
    /// The text with uniform dashes and no leading or trailing dash. Whitespace is
    /// left untouched.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static string NormalizeDashes(this string s)
    {
        ArgumentNullException.ThrowIfNull(s);

        foreach (var dash in _dashCharacters)
        {
            s = s.Replace(dash, _normalizedDash);
        }

        // Trim only after every variant has been unified. Trimming inside the loop
        // made the result depend on the order of _dashCharacters: for "—–Title" the
        // en dash was tested before the em dash in front of it had been removed, so
        // a leading dash survived.
        return s.Trim(_normalizedDash);
    }

    /// <summary>
    /// Computes how similar two strings are, based on their Levenshtein distance
    /// relative to the length of the longer string.
    /// </summary>
    /// <param name="s">The first string.</param>
    /// <param name="c">The string to compare against.</param>
    /// <param name="caseInsensitive">
    /// When true, both strings are lower-cased before they are compared.
    /// </param>
    /// <returns>
    /// (longer length - distance) / longer length; 1.0 for identical strings,
    /// including two empty strings.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="s"/> or <paramref name="c"/> is null.
    /// </exception>
    public static double DistancePercentageFrom(this string s, string c, bool caseInsensitive = false)
    {
        ArgumentNullException.ThrowIfNull(s);
        ArgumentNullException.ThrowIfNull(c);

        if (caseInsensitive)
        {
            // Invariant casing keeps the score independent of the current culture
            // (for example the Turkish dotless i).
            s = s.ToLowerInvariant();
            c = c.ToLowerInvariant();
        }

        var longest = Math.Max(s.Length, c.Length);
        if (longest == 0)
        {
            // Two empty strings are identical; without this guard the division
            // below evaluates 0 / 0 and yields NaN.
            return 1.0;
        }

        Fastenshtein.Levenshtein lev = new(c);
        var distance = lev.DistanceFrom(s);
        return (double)(longest - distance) / longest;
    }
}
}
Loading

0 comments on commit 27b280a

Please sign in to comment.