Skip to content

Commit

Permalink
Fix #5
Browse files Browse the repository at this point in the history
In parse mode – i.e. when using the `WarcParser` – any subclass of `Record` that has a `WARC-Identified-Payload-Type` field will be created with its `IdentifiedPayloadType` set to the parsed value, if any.

In non-parse mode – i.e. when instantiating each of those subclasses using their respective constructor(s) – the `IdentifiedPayloadType` will be set to the value of `PayloadTypeIdentifier.Identify(…)`.

This bug was due to an oversight: it should have operated the same way as `Record.BlockDigest`, which implemented the conditional check from the get-go.
  • Loading branch information
nurhafiz committed May 4, 2023
1 parent 2ae8b5e commit 2345d2d
Show file tree
Hide file tree
Showing 20 changed files with 362 additions and 138 deletions.
17 changes: 17 additions & 0 deletions src/Toimik.WarcProtocol/Records/ContinuationRecord.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ namespace Toimik.WarcProtocol;

public class ContinuationRecord : Record
{
public const string FieldForIdentifiedPayloadType = "warc-identified-payload-type";

public const string FieldForInfoId = "warc-warcinfo-id";

public const string FieldForPayloadDigest = "warc-payload-digest";
Expand Down Expand Up @@ -51,6 +53,7 @@ public class ContinuationRecord : Record
FieldForSegmentOriginId,
FieldForSegmentNumber,
FieldForSegmentTotalLength,
FieldForIdentifiedPayloadType,
};

public ContinuationRecord(
Expand All @@ -62,6 +65,7 @@ public ContinuationRecord(
Uri segmentOriginId,
int segmentNumber,
int? segmentTotalLength = null,
string? identifiedPayloadType = null,
string? truncatedReason = null,
DigestFactory? digestFactory = null)
: this(
Expand All @@ -75,6 +79,7 @@ public ContinuationRecord(
segmentOriginId,
segmentNumber,
segmentTotalLength,
identifiedPayloadType,
truncatedReason,
digestFactory)
{
Expand All @@ -91,6 +96,7 @@ public ContinuationRecord(
Uri segmentOriginId,
int segmentNumber,
int? segmentTotalLength = null,
string? identifiedPayloadType = null,
string? truncatedReason = null,
DigestFactory? digestFactory = null)
: base(
Expand All @@ -106,6 +112,7 @@ public ContinuationRecord(

// REMINDER: This is not auto-generated because it must be identical to the source's
PayloadDigest = payloadDigest;
IdentifiedPayloadType ??= identifiedPayloadType;
InfoId = infoId;
TargetUri = targetUri;
SegmentOriginId = segmentOriginId;
Expand All @@ -127,6 +134,8 @@ internal ContinuationRecord(
{
}

public string? IdentifiedPayloadType { get; private set; }

public Uri? InfoId { get; private set; }

public string? PayloadDigest { get; private set; }
Expand Down Expand Up @@ -156,6 +165,10 @@ protected internal override void Set(string field, string value)
{
switch (field.ToLower())
{
case FieldForIdentifiedPayloadType:
IdentifiedPayloadType = value;
break;

case FieldForInfoId:
InfoId = Utils.RemoveBracketsFromUri(value);
break;
Expand Down Expand Up @@ -205,6 +218,10 @@ protected internal override void Set(string field, string value)
text = $"WARC-Date: {Utils.FormatDate(Date)}{WarcParser.CrLf}";
break;

case FieldForIdentifiedPayloadType:
text = ToString("WARC-Identified-Payload-Type", IdentifiedPayloadType);
break;

case FieldForInfoId:
text = ToString("WARC-Warcinfo-ID", Utils.AddBracketsToUri(InfoId));
break;
Expand Down
15 changes: 12 additions & 3 deletions src/Toimik.WarcProtocol/Records/ConversionRecord.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ public ConversionRecord(
Uri infoId,
Uri targetUri,
string? payloadDigest = null,
string? identifiedPayloadType = null,
Uri? refersTo = null,
bool isSegmented = false,
string? truncatedReason = null,
Expand All @@ -78,6 +79,7 @@ public ConversionRecord(
infoId,
targetUri,
payloadDigest,
identifiedPayloadType,
refersTo,
isSegmented,
truncatedReason,
Expand All @@ -95,6 +97,7 @@ public ConversionRecord(
Uri infoId,
Uri targetUri,
string? payloadDigest = null,
string? identifiedPayloadType = null,
Uri? refersTo = null,
bool isSegmented = false,
string? truncatedReason = null,
Expand All @@ -113,6 +116,7 @@ public ConversionRecord(
SetContentBlock(recordBlock, isParsed);

PayloadDigest = payloadDigest;
IdentifiedPayloadType ??= identifiedPayloadType;
if (recordBlock.Length > 0)
{
ContentType = contentType;
Expand Down Expand Up @@ -170,19 +174,24 @@ internal override void SetContentBlock(byte[] contentBlock, bool isParsed = true
{
base.SetContentBlock(contentBlock, isParsed);
RecordBlock = contentBlock;
IdentifiedPayloadType = PayloadTypeIdentifier.Identify(RecordBlock);
if (!isParsed)
{
IdentifiedPayloadType = PayloadTypeIdentifier.Identify(RecordBlock);
}
}

protected internal override void Set(string field, string value)
{
// NOTE: FieldForIdentifiedPayloadType, if any, is ignored because it is supposed to be
// auto detected when the content block is set
switch (field.ToLower())
{
case FieldForContentType:
ContentType = value;
break;

case FieldForIdentifiedPayloadType:
IdentifiedPayloadType = value;
break;

case FieldForInfoId:
InfoId = Utils.RemoveBracketsFromUri(value);
break;
Expand Down
3 changes: 1 addition & 2 deletions src/Toimik.WarcProtocol/Records/Record.cs
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,7 @@ internal static string ToString(string field, object? value)
internal virtual void SetContentBlock(byte[] contentBlock, bool isParsed = true)
{
/* Depending on the record's type, a content block consists of a record block and / or a
* payload. If both exists, they are delimited by a consecutive pair of '\r\n' where the
* first pair is found at the end of a line and the other is on its own line.
* payload. The subclasses are responsible to detect those values.
*/

if (!isParsed)
Expand Down
22 changes: 17 additions & 5 deletions src/Toimik.WarcProtocol/Records/RequestRecord.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public RequestRecord(
Uri infoId,
Uri targetUri,
string? payloadDigest = null,
string? identifiedPayloadType = null,
IPAddress? ipAddress = null,
ISet<Uri>? concurrentTos = null,
string? truncatedReason = null,
Expand All @@ -80,6 +81,7 @@ public RequestRecord(
infoId,
targetUri,
payloadDigest,
identifiedPayloadType,
ipAddress,
concurrentTos,
truncatedReason,
Expand All @@ -97,6 +99,7 @@ public RequestRecord(
Uri infoId,
Uri targetUri,
string? payloadDigest = null,
string? identifiedPayloadType = null,
IPAddress? ipAddress = null,
ISet<Uri>? concurrentTos = null,
string? truncatedReason = null,
Expand All @@ -115,6 +118,7 @@ public RequestRecord(
SetContentBlock(contentBlock, isParsed);

PayloadDigest = payloadDigest;
IdentifiedPayloadType ??= identifiedPayloadType;
if (contentBlock.Length > 0)
{
ContentType = contentType;
Expand Down Expand Up @@ -176,22 +180,22 @@ internal override void SetContentBlock(byte[] contentBlock, bool isParsed = true
if (index == -1)
{
RecordBlock = Encoding.UTF8.GetString(contentBlock);
Payload = Array.Empty<byte>();
}
else
{
RecordBlock = Encoding.UTF8.GetString(contentBlock[0..index]);
Payload = contentBlock[(index + (WarcParser.CrLf.Length * 2))..];
Payload = contentBlock[(index + PayloadTypeIdentifier.Delimiter.Length)..];
if (!isParsed)
{
IdentifiedPayloadType = PayloadTypeIdentifier.Identify(Payload);
}
}

ContentBlock = contentBlock;
IdentifiedPayloadType = PayloadTypeIdentifier.Identify(Payload);
}

protected internal override void Set(string field, string value)
{
// NOTE: FieldForIdentifiedPayloadType, if any, is ignored because it is supposed to be
// auto detected when the content block is set
switch (field.ToLower())
{
case FieldForConcurrentTo:
Expand All @@ -202,6 +206,10 @@ protected internal override void Set(string field, string value)
ContentType = value;
break;

case FieldForIdentifiedPayloadType:
IdentifiedPayloadType = value;
break;

case FieldForInfoId:
InfoId = Utils.RemoveBracketsFromUri(value);
break;
Expand Down Expand Up @@ -255,6 +263,10 @@ protected internal override void Set(string field, string value)
text = $"WARC-Date: {Utils.FormatDate(Date)}{WarcParser.CrLf}";
break;

case FieldForIdentifiedPayloadType:
text = ToString("WARC-Identified-Payload-Type", IdentifiedPayloadType);
break;

case FieldForInfoId:
text = ToString("WARC-Warcinfo-ID", Utils.AddBracketsToUri(InfoId));
break;
Expand Down
15 changes: 12 additions & 3 deletions src/Toimik.WarcProtocol/Records/ResourceRecord.cs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ public ResourceRecord(
Uri infoId,
Uri targetUri,
string? payloadDigest = null,
string? identifiedPayloadType = null,
IPAddress? ipAddress = null,
ISet<Uri>? concurrentTos = null,
bool isSegmented = false,
Expand All @@ -83,6 +84,7 @@ public ResourceRecord(
infoId,
targetUri,
payloadDigest,
identifiedPayloadType,
ipAddress,
concurrentTos,
isSegmented,
Expand All @@ -101,6 +103,7 @@ public ResourceRecord(
Uri infoId,
Uri targetUri,
string? payloadDigest = null,
string? identifiedPayloadType = null,
IPAddress? ipAddress = null,
ISet<Uri>? concurrentTos = null,
bool isSegmented = false,
Expand All @@ -120,6 +123,7 @@ public ResourceRecord(
SetContentBlock(recordBlock, isParsed);

PayloadDigest = payloadDigest;
IdentifiedPayloadType ??= identifiedPayloadType;
if (recordBlock.Length > 0)
{
ContentType = contentType;
Expand Down Expand Up @@ -181,13 +185,14 @@ internal override void SetContentBlock(byte[] contentBlock, bool isParsed = true
{
base.SetContentBlock(contentBlock, isParsed);
RecordBlock = contentBlock;
IdentifiedPayloadType = PayloadTypeIdentifier.Identify(RecordBlock);
if (!isParsed)
{
IdentifiedPayloadType = PayloadTypeIdentifier.Identify(RecordBlock);
}
}

protected internal override void Set(string field, string value)
{
// NOTE: FieldForIdentifiedPayloadType, if any, is ignored because it is supposed to be
// auto detected when the content block is set
switch (field.ToLower())
{
case FieldForConcurrentTo:
Expand All @@ -198,6 +203,10 @@ protected internal override void Set(string field, string value)
ContentType = value;
break;

case FieldForIdentifiedPayloadType:
IdentifiedPayloadType = value;
break;

case FieldForInfoId:
InfoId = Utils.RemoveBracketsFromUri(value);
break;
Expand Down
15 changes: 12 additions & 3 deletions src/Toimik.WarcProtocol/Records/ResponseRecord.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ public ResponseRecord(
Uri infoId,
Uri targetUri,
string? payloadDigest = null,
string? identifiedPayloadType = null,
IPAddress? ipAddress = null,
ISet<Uri>? concurrentTos = null,
bool isSegmented = false,
Expand All @@ -84,6 +85,7 @@ public ResponseRecord(
infoId,
targetUri,
payloadDigest,
identifiedPayloadType,
ipAddress,
concurrentTos,
isSegmented,
Expand All @@ -102,6 +104,7 @@ public ResponseRecord(
Uri infoId,
Uri targetUri,
string? payloadDigest = null,
string? identifiedPayloadType = null,
IPAddress? ipAddress = null,
ISet<Uri>? concurrentTos = null,
bool isSegmented = false,
Expand All @@ -121,6 +124,7 @@ public ResponseRecord(
SetContentBlock(contentBlock, isParsed);

PayloadDigest = payloadDigest;
IdentifiedPayloadType ??= identifiedPayloadType;
if (contentBlock.Length > 0)
{
ContentType = contentType;
Expand Down Expand Up @@ -195,14 +199,15 @@ internal override void SetContentBlock(byte[] contentBlock, bool isParsed = true
{
RecordBlock = Encoding.UTF8.GetString(contentBlock[0..index]);
Payload = contentBlock[(index + PayloadTypeIdentifier.Delimiter.Length)..];
IdentifiedPayloadType = PayloadTypeIdentifier.Identify(Payload);
if (!isParsed)
{
IdentifiedPayloadType = PayloadTypeIdentifier.Identify(Payload);
}
}
}

protected internal override void Set(string field, string value)
{
// NOTE: FieldForIdentifiedPayloadType, if any, is ignored because it is supposed to be
// auto detected when the content block is set
switch (field.ToLower())
{
case FieldForConcurrentTo:
Expand All @@ -213,6 +218,10 @@ protected internal override void Set(string field, string value)
ContentType = value;
break;

case FieldForIdentifiedPayloadType:
IdentifiedPayloadType = value;
break;

case FieldForInfoId:
InfoId = Utils.RemoveBracketsFromUri(value);
break;
Expand Down
4 changes: 2 additions & 2 deletions src/Toimik.WarcProtocol/Toimik.WarcProtocol.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
<TargetFramework>net6.0</TargetFramework>
<Nullable>enable</Nullable>
<AssemblyName>Toimik.WarcProtocol</AssemblyName>
<PackageVersion>0.7.2</PackageVersion>
<PackageVersion>0.8.0</PackageVersion>
<Authors>Nurhafiz</Authors>
<Version>0.7.2</Version>
<Version>0.8.0</Version>
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
<Company>Toimik</Company>
<Description>
Expand Down
Loading

0 comments on commit 2345d2d

Please sign in to comment.