Skip to content

Commit

Permalink
Merge pull request #25 from henrikwidlund/feature/refactor_with_docs
Browse files Browse the repository at this point in the history
Improve readme and documentation
  • Loading branch information
henrikwidlund authored Aug 3, 2021
2 parents 901122b + 81382b2 commit 2d8479c
Show file tree
Hide file tree
Showing 15 changed files with 273 additions and 143 deletions.
10 changes: 5 additions & 5 deletions HostsParser.Benchmarks/BenchmarkCollectionUtilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,19 @@ public abstract class BenchmarkCollectionUtilitiesBase : BenchmarkStreamBase
public IEnumerable<HashSet<string>> Source()
{
var stream = PrepareStream();
var source = HostUtilities
.ProcessSource(stream, BenchmarkTestData.Settings.SkipLinesBytes,
var hostsBasedLines = HostUtilities
.ProcessHostsBased(stream, BenchmarkTestData.Settings.HostsBased.SkipLinesBytes,
BenchmarkTestData.Decoder).GetAwaiter().GetResult();

stream = PrepareStream();
var adGuard = HostUtilities.ProcessAdGuard(stream, BenchmarkTestData.Decoder)
var adBlockBasedLines = HostUtilities.ProcessAdBlockBased(stream, BenchmarkTestData.Decoder)
.GetAwaiter().GetResult();

stream.Dispose();

source.UnionWith(adGuard);
hostsBasedLines.UnionWith(adBlockBasedLines);

yield return source;
yield return hostsBasedLines;
}
}
}
28 changes: 14 additions & 14 deletions HostsParser.Benchmarks/BenchmarkHostUtilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ namespace HostsParser.Benchmarks
{
[MemoryDiagnoser]
[BenchmarkCategory(nameof(HostUtilities))]
public class BenchmarkProcessSource : BenchmarkStreamBase
public class BenchmarkProcessHostsBased : BenchmarkStreamBase
{
private Stream _stream;

Expand All @@ -21,18 +21,18 @@ public class BenchmarkProcessSource : BenchmarkStreamBase
public void IterationCleanup() => _stream?.Dispose();

[Benchmark]
[BenchmarkCategory(nameof(ProcessSource), nameof(HostUtilities))]
public async Task<HashSet<string>> ProcessSource()
[BenchmarkCategory(nameof(ProcessHostsBased), nameof(HostUtilities))]
public async Task<HashSet<string>> ProcessHostsBased()
{
return await HostUtilities.ProcessSource(_stream,
BenchmarkTestData.Settings.SkipLinesBytes,
return await HostUtilities.ProcessHostsBased(_stream,
BenchmarkTestData.Settings.HostsBased.SkipLinesBytes,
BenchmarkTestData.Decoder);
}
}

[MemoryDiagnoser]
[BenchmarkCategory(nameof(HostUtilities))]
public class BenchmarkProcessAdGuard : BenchmarkStreamBase
public class BenchmarkProcessAdBlockBased : BenchmarkStreamBase
{
private Stream _stream;

Expand All @@ -43,9 +43,9 @@ public class BenchmarkProcessAdGuard : BenchmarkStreamBase
public void IterationCleanup() => _stream?.Dispose();

[Benchmark]
[BenchmarkCategory(nameof(ProcessAdGuard), nameof(HostUtilities))]
public async Task<HashSet<string>> ProcessAdGuard()
=> await HostUtilities.ProcessAdGuard(_stream, BenchmarkTestData.Decoder);
[BenchmarkCategory(nameof(ProcessAdBlockBased), nameof(HostUtilities))]
public async Task<HashSet<string>> ProcessAdBlockBased()
=> await HostUtilities.ProcessAdBlockBased(_stream, BenchmarkTestData.Decoder);
}

[MemoryDiagnoser]
Expand All @@ -61,18 +61,18 @@ public void RemoveKnownBadHosts(HashSet<string> data)
public IEnumerable<HashSet<string>> Source()
{
var stream = PrepareStream();
var source = HostUtilities
.ProcessSource(stream, BenchmarkTestData.Settings.SkipLinesBytes,
var hostsBasedLines = HostUtilities
.ProcessHostsBased(stream, BenchmarkTestData.Settings.HostsBased.SkipLinesBytes,
BenchmarkTestData.Decoder).GetAwaiter().GetResult();

stream = PrepareStream();
var adGuard = HostUtilities.ProcessAdGuard(stream, BenchmarkTestData.Decoder)
var adBlockBasedLines = HostUtilities.ProcessAdBlockBased(stream, BenchmarkTestData.Decoder)
.GetAwaiter().GetResult();

stream.Dispose();

source.UnionWith(adGuard);
yield return source;
hostsBasedLines.UnionWith(adBlockBasedLines);
yield return hostsBasedLines;
}
}
}
4 changes: 2 additions & 2 deletions HostsParser.Benchmarks/BenchmarkStreamBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ protected static Stream PrepareStream()
var stream = new MemoryStream();

using var sw = new BinaryWriter(stream, Encoding.UTF8, true);
sw.Write(BenchmarkTestData.SourceTestBytes);
sw.Write(BenchmarkTestData.AdGuardTestBytes);
sw.Write(BenchmarkTestData.HostsBasedTestBytes);
sw.Write(BenchmarkTestData.AdBlockBasedTestBytes);
sw.Flush();

stream.Seek(0, SeekOrigin.Begin);
Expand Down
4 changes: 2 additions & 2 deletions HostsParser.Benchmarks/BenchmarkTestData.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ namespace HostsParser.Benchmarks
{
internal static class BenchmarkTestData
{
public static readonly byte[] SourceTestBytes = File.ReadAllBytes("sourcehosts.txt");
public static readonly byte[] AdGuardTestBytes = File.ReadAllBytes("adguardhosts.txt");
public static readonly byte[] HostsBasedTestBytes = File.ReadAllBytes("hostsbased.txt");
public static readonly byte[] AdBlockBasedTestBytes = File.ReadAllBytes("adbockbased.txt");

public static readonly Settings Settings =
JsonSerializer.Deserialize<Settings>(File.ReadAllBytes("appsettings.json"));
Expand Down
4 changes: 2 additions & 2 deletions HostsParser.Benchmarks/HostsParser.Benchmarks.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
</ItemGroup>

<ItemGroup>
<None Update="adguardhosts.txt">
<None Update="adbockbased.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<None Update="sourcehosts.txt">
<None Update="hostsbased.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<None Update="appsettings.json">
Expand Down
File renamed without changes.
44 changes: 24 additions & 20 deletions HostsParser.Benchmarks/appsettings.json
Original file line number Diff line number Diff line change
@@ -1,25 +1,29 @@
{
"SourceUri": "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn/hosts",
"AdGuardUri": "https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt",
"HostsBased": {
"SourceUri": "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn/hosts",
"SkipLines": [
"127.0.0.1 localhost",
"127.0.0.1 localhost.localdomain",
"127.0.0.1 local",
"255.255.255.255 broadcasthost",
"::1 localhost",
"::1 ip6-localhost",
"::1 ip6-loopback",
"fe80::1%lo0 localhost",
"ff00::0 ip6-localnet",
"ff00::0 ip6-mcastprefix",
"ff02::1 ip6-allnodes",
"ff02::2 ip6-allrouters",
"ff02::3 ip6-allhosts",
"0.0.0.0 0.0.0.0",
"0.0.0.0 fe #00::0 ip6-localnet",
"0.0.0.0 ff #00::0 ip6-mcastprefix"
]
},
"AdBlockBased": {
"SourceUri": "https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt"
},
"ExtraFiltering": false,
"SkipLines": [
"127.0.0.1 localhost",
"127.0.0.1 localhost.localdomain",
"127.0.0.1 local",
"255.255.255.255 broadcasthost",
"::1 localhost",
"::1 ip6-localhost",
"::1 ip6-loopback",
"fe80::1%lo0 localhost",
"ff00::0 ip6-localnet",
"ff00::0 ip6-mcastprefix",
"ff02::1 ip6-allnodes",
"ff02::2 ip6-allrouters",
"ff02::3 ip6-allhosts",
"0.0.0.0 0.0.0.0",
"0.0.0.0 fe #00::0 ip6-localnet",
"0.0.0.0 ff #00::0 ip6-mcastprefix"
],
"HeaderLines": [
"! Copyright Henrik Widlund https://github.com/henrikwidlund/HostsParser/blob/main/LICENSE",
"! All content below commented lines are based on StevenBlack/hosts and AdGuard DNS filter.",
Expand Down
File renamed without changes.
35 changes: 24 additions & 11 deletions HostsParser/CollectionUtilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,14 @@ namespace HostsParser
{
internal static class CollectionUtilities
{
internal static List<string> SortDnsList(ICollection<string> dnsList)
/// <summary>
/// Sorts <paramref name="dnsCollection"/> by domain and length.
/// </summary>
/// <param name="dnsCollection">The collection to sort.</param>
internal static List<string> SortDnsList(ICollection<string> dnsCollection)
{
List<string> list = new(dnsList.Count);
list.AddRange(dnsList
List<string> list = new(dnsCollection.Count);
list.AddRange(dnsCollection
.Select(d => new StringSortItem(d))
.OrderBy(l => GetTopMostDns(l.RawMemory), ReadOnlyMemoryCharComparer.Default)
.ThenBy(l => l.RawMemory.Length)
Expand All @@ -22,12 +26,15 @@ internal static List<string> SortDnsList(ICollection<string> dnsList)
return list;
}

internal static void FilterGrouped(HashSet<string> dnsList)
/// <summary>
/// Removes every sub domain from <paramref name="dnsCollection"/> whose parent domain is also contained in it.
/// </summary>
internal static void FilterGrouped(HashSet<string> dnsCollection)
{
var cacheHashSet = CreateCacheHashSet(dnsList);
var cacheHashSet = CreateCacheHashSet(dnsCollection);

var dnsGroups = GroupDnsList(dnsList);
HashSet<string> filtered = new(dnsList.Count);
var dnsGroups = GroupDnsList(dnsCollection);
HashSet<string> filtered = new(dnsCollection.Count);
foreach (var (key, value) in dnsGroups)
{
if (!cacheHashSet.Contains(key)
Expand All @@ -43,13 +50,19 @@ internal static void FilterGrouped(HashSet<string> dnsList)
}
}

dnsList.ExceptWith(filtered);
dnsCollection.ExceptWith(filtered);
}

internal static Dictionary<int, List<string>> GroupDnsList(HashSet<string> dnsList)
/// <summary>
/// Groups <paramref name="dnsCollection"/> into a dictionary where the key is the hash code of the main domain
/// and the value is a list of found sub domains.
/// </summary>
/// <param name="dnsCollection">The collection used for grouping.</param>
/// <returns>The dictionary of grouped entries, keyed by the hash code of each main domain.</returns>
internal static Dictionary<int, List<string>> GroupDnsList(HashSet<string> dnsCollection)
{
var dict = new Dictionary<int, List<string>>(dnsList.Count);
foreach (var s in dnsList)
var dict = new Dictionary<int, List<string>>(dnsCollection.Count);
foreach (var s in dnsCollection)
{
var key = string.GetHashCode(GetTopMostDns(s));
List<string> values;
Expand Down
45 changes: 34 additions & 11 deletions HostsParser/HostUtilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,43 @@ internal static class HostUtilities
{
private static readonly Memory<char> Cache = new char[256];

internal static async Task<HashSet<string>> ProcessSource(Stream stream,
byte[][] skipLines,
/// <summary>
/// Reads the <paramref name="stream"/> and returns a collection based on the items in it.
/// </summary>
/// <param name="stream">The <see cref="Stream"/> to process.</param>
/// <param name="skipLines">The lines that should be excluded from the returned result.</param>
/// <param name="decoder">The <see cref="Decoder"/> used when converting the bytes in <paramref name="stream"/>.</param>
internal static async Task<HashSet<string>> ProcessHostsBased(Stream stream,
byte[][]? skipLines,
Decoder decoder)
{
var pipeReader = PipeReader.Create(stream);
// Assumed length to reduce allocations
var dnsList = new HashSet<string>(140_000);
await ReadPipeAsync(pipeReader, dnsList, skipLines, decoder);
return dnsList;
}

internal static async Task<HashSet<string>> ProcessAdGuard(Stream stream,
/// <summary>
/// Reads the <paramref name="stream"/> and returns a collection based on the items in it.
/// </summary>
/// <param name="stream">The <see cref="Stream"/> to process.</param>
/// <param name="decoder">The <see cref="Decoder"/> used when converting the bytes in <paramref name="stream"/>.</param>
internal static async Task<HashSet<string>> ProcessAdBlockBased(Stream stream,
Decoder decoder)
{
var pipeReader = PipeReader.Create(stream);
// Assumed length to reduce allocations
var dnsList = new HashSet<string>(50_000);
await ReadPipeAsync(pipeReader, dnsList, null, decoder);
return dnsList;
}

/// <summary>
/// Removes all sub domains of the entries in <paramref name="knownBadHosts"/> from <paramref name="hosts"/>.
/// </summary>
/// <param name="knownBadHosts">Array of hosts used for removing sub domains.</param>
/// <param name="hosts">The collection of hosts that sub domains should be removed from.</param>
internal static HashSet<string> RemoveKnownBadHosts(string[] knownBadHosts,
HashSet<string> hosts)
{
Expand All @@ -58,6 +76,11 @@ internal static HashSet<string> RemoveKnownBadHosts(string[] knownBadHosts,
return hosts;
}

/// <summary>
/// Checks if <paramref name="potentialSubDomain"/> is a sub domain of <paramref name="potentialDomain"/>.
/// </summary>
/// <param name="potentialSubDomain">The potential sub domain.</param>
/// <param name="potentialDomain">The potential domain.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static bool IsSubDomainOf(in ReadOnlySpan<char> potentialSubDomain,
in ReadOnlySpan<char> potentialDomain)
Expand Down Expand Up @@ -109,12 +132,12 @@ private static void ProcessLine(in ReadOnlySequence<byte> slice,
Decoder decoder)
{
if (skipLines == null)
ProcessAdGuardLine(slice, resultCollection, decoder);
ProcessAdBlockBasedLine(slice, resultCollection, decoder);
else
ProcessSourceLine(slice, resultCollection, skipLines, decoder);
ProcessHostsBasedLine(slice, resultCollection, skipLines, decoder);
}

private static void ProcessSourceLine(in ReadOnlySequence<byte> slice,
private static void ProcessHostsBasedLine(in ReadOnlySequence<byte> slice,
ICollection<string> resultCollection,
byte[][] skipLines,
Decoder decoder)
Expand All @@ -128,7 +151,7 @@ private static void ProcessSourceLine(in ReadOnlySequence<byte> slice,
if (realSlice[0] == Constants.HashSign)
return;

if (SourceShouldSkipLine(realSlice, skipLines))
if (HostsBasedShouldSkipLine(realSlice, skipLines))
return;

realSlice = HandleWwwPrefix(realSlice);
Expand All @@ -140,7 +163,7 @@ private static void ProcessSourceLine(in ReadOnlySequence<byte> slice,
resultCollection.Add(Cache.Span[..realSlice.Length].Trim().ToString());
}

private static void ProcessAdGuardLine(in ReadOnlySequence<byte> slice,
private static void ProcessAdBlockBasedLine(in ReadOnlySequence<byte> slice,
ICollection<string> resultCollection,
Decoder decoder)
{
Expand All @@ -150,7 +173,7 @@ private static void ProcessAdGuardLine(in ReadOnlySequence<byte> slice,
if (realSlice.IsEmpty)
return;

if (AdGuardShouldSkipLine(realSlice))
if (AdBlockBasedShouldSkipLine(realSlice))
return;

realSlice = HandlePipe(realSlice);
Expand All @@ -162,7 +185,7 @@ private static void ProcessAdGuardLine(in ReadOnlySequence<byte> slice,
resultCollection.Add(Cache.Span[..realSlice.Length].ToString());
}

private static bool SourceShouldSkipLine(in ReadOnlySpan<byte> bytes,
private static bool HostsBasedShouldSkipLine(in ReadOnlySpan<byte> bytes,
byte[][] skipLines)
{
if (TrimStart(bytes)[0] == Constants.HashSign)
Expand All @@ -178,7 +201,7 @@ private static bool SourceShouldSkipLine(in ReadOnlySpan<byte> bytes,
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool AdGuardShouldSkipLine(in ReadOnlySpan<byte> current)
private static bool AdBlockBasedShouldSkipLine(in ReadOnlySpan<byte> current)
=> current[0] != Constants.PipeSign;

private static ReadOnlySpan<byte> TrimStart(in this ReadOnlySpan<byte> span)
Expand Down
Loading

0 comments on commit 2d8479c

Please sign in to comment.