-
Notifications
You must be signed in to change notification settings - Fork 67
/
wayback.go
328 lines (291 loc) · 8.64 KB
/
wayback.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
// Copyright 2020 Wayback Archiver. All rights reserved.
// Use of this source code is governed by the GNU GPL v3
// license that can be found in the LICENSE file.
package wayback // import "github.com/wabarc/wayback"
import (
"context"
"fmt"
"net/url"
"os"
"sync"
"time"
"github.com/wabarc/logger"
"github.com/wabarc/playback"
"github.com/wabarc/rivet/ipfs"
"github.com/wabarc/wayback/config"
"github.com/wabarc/wayback/errors"
"github.com/wabarc/wayback/ingress"
"github.com/wabarc/wayback/reduxer"
"golang.org/x/sync/errgroup"
is "github.com/wabarc/archive.is"
ia "github.com/wabarc/archive.org"
ga "github.com/wabarc/ghostarchive"
ip "github.com/wabarc/rivet"
ph "github.com/wabarc/telegra.ph"
pinner "github.com/wabarc/ipfs-pinner"
)
// Collect holds the outcome of one archive or playback attempt for a single
// source URL against a single slot.
//
// Wayback and Playback in this file set Arc and Ext to the slot name, Src to
// the string form of the input URL, and Dst to the string returned by the
// slot. For Wayback, Dst carries the error text when archiving fails.
type Collect struct {
Arc string // Archive slot name, see config/config.go
Dst string // Result string from the slot: the archived destination URL, or the error text on failure
Src string // Source URL
Ext string // Extra identifier; Wayback and Playback set it to the slot name
}
// IA represents the Internet Archive slot.
// It bundles the per-request state consumed by IA.Wayback.
type IA struct {
ctx context.Context // context passed to the upstream archiver call
cfg *config.Options // configuration options
URL *url.URL // URL to be archived
}
// IS represents the archive.today slot.
// It bundles the per-request state consumed by IS.Wayback.
type IS struct {
ctx context.Context // context passed to the upstream archiver call
cfg *config.Options // configuration options
URL *url.URL // URL to be archived
}
// IP represents the IPFS slot.
// It bundles the per-request state consumed by IP.Wayback.
type IP struct {
ctx context.Context // base context; IP.Wayback may attach bundled HTML to a derived context
cfg *config.Options // configuration options; supplies the IPFS mode, host, port, target and credentials
URL *url.URL // URL to be archived
}
// PH represents the Telegra.ph slot.
// It bundles the per-request state consumed by PH.Wayback.
type PH struct {
ctx context.Context // base context; PH.Wayback may attach screenshots and the article to a derived context
cfg *config.Options // configuration options; supplies the remote Chrome settings
URL *url.URL // URL to be archived
}
// GA represents the Ghostarchive slot.
// It bundles the per-request state consumed by GA.Wayback.
type GA struct {
ctx context.Context // context passed to the upstream archiver call
cfg *config.Options // configuration options
URL *url.URL // URL to be archived
}
// Waybacker is the interface that wraps the basic Wayback method.
//
// Wayback archives the URL held by the implementing slot type through that
// slot's upstream service and returns the result as a string. The
// implementations in this file return the archived destination on success
// and the error text on failure. The reduxer.Reduxer argument gives access
// to previously bundled page assets; the IP and PH slots read from it, the
// IA, IS and GA slots ignore it.
type Waybacker interface {
Wayback(reduxer.Reduxer) string
}
// Wayback implements the Waybacker interface for the Internet Archive slot.
//
// It submits i.URL to the Internet Archive using the shared ingress HTTP
// client and returns the archived URL. On failure the error is logged and
// its text is returned in place of the URL. The reduxer argument is unused.
func (i IA) Wayback(_ reduxer.Reduxer) string {
	archiver := &ia.Archiver{Client: ingress.Client()}

	archived, err := archiver.Wayback(i.ctx, i.URL)
	if err == nil {
		return archived
	}

	logger.Error("wayback %s to Internet Archive failed: %v", i.URL.String(), err)
	return fmt.Sprint(err)
}
// Wayback implements the Waybacker interface for the archive.today slot.
//
// It submits i.URL to archive.today using the shared ingress HTTP client and
// returns the archived URL. On failure the error is logged and its text is
// returned in place of the URL. The reduxer argument is unused.
func (i IS) Wayback(_ reduxer.Reduxer) string {
	archiver := is.NewArchiver(ingress.Client())
	// CloseTor is deferred so it always runs once this request is done,
	// whether archiving succeeded or not.
	defer archiver.CloseTor()

	archived, err := archiver.Wayback(i.ctx, i.URL)
	if err == nil {
		return archived
	}

	logger.Error("wayback %s to archive.today failed: %v", i.URL.String(), err)
	return fmt.Sprint(err)
}
// Wayback implements the Waybacker interface for the Ghostarchive slot.
//
// It submits g.URL to Ghostarchive using the shared ingress HTTP client and
// returns the archived URL. On failure the error is logged and its text is
// returned in place of the URL. The reduxer argument is unused.
func (g GA) Wayback(_ reduxer.Reduxer) string {
	archiver := &ga.Archiver{Client: ingress.Client()}

	archived, err := archiver.Wayback(g.ctx, g.URL)
	if err == nil {
		return archived
	}

	logger.Error("wayback %s to Ghostarchive failed: %v", g.URL.String(), err)
	return fmt.Sprint(err)
}
// Wayback implements the Waybacker interface for the IPFS slot.
//
// Pinning options are built from the configuration: remote mode by default,
// or local mode with the configured host and port when the IPFS mode is
// "daemon". For the Infura, Pinata, NFTStorage and Web3Storage targets the
// target name, API key and secret are appended as well.
//
// When rdx holds a bundle for the URL and its HTML file can be read, the
// file content is attached to the context as input for the archiver.
// A read failure is ignored and archiving proceeds without that input.
//
// It returns the archived destination, or the error text after logging the
// failure.
func (i IP) Wayback(rdx reduxer.Reduxer) string {
	pinOpts := []ipfs.PinningOption{
		ipfs.Mode(ipfs.Remote),
	}
	if i.cfg.IPFSMode() == "daemon" {
		pinOpts = []ipfs.PinningOption{
			ipfs.Mode(ipfs.Local),
			ipfs.Host(i.cfg.IPFSHost()),
			ipfs.Port(i.cfg.IPFSPort()),
		}
	}

	// Only the hosted pinning services below take credentials.
	service := i.cfg.IPFSTarget()
	switch service {
	case pinner.Infura, pinner.Pinata, pinner.NFTStorage, pinner.Web3Storage:
		key := i.cfg.IPFSApikey()
		secret := i.cfg.IPFSSecret()
		pinOpts = append(pinOpts, ipfs.Uses(service), ipfs.Apikey(key), ipfs.Secret(secret))
	}

	shaft := &ip.Shaft{Hold: ipfs.Options(pinOpts...)}
	uri := i.URL.String()
	ctx := i.ctx

	// If there is bundled HTML, it is utilized as the basis for IPFS
	// archiving and is sent to obelisk to crawl the rest of the page.
	if bundle, found := rdx.Load(reduxer.Src(uri)); found {
		htmlPath := fmt.Sprint(bundle.Shots().HTML)
		if raw, readErr := os.ReadFile(htmlPath); readErr == nil {
			ctx = shaft.WithInput(ctx, raw)
		}
	}

	archived, err := shaft.Wayback(ctx, i.URL)
	if err == nil {
		return archived
	}

	logger.Error("wayback %s to IPFS failed: %v", i.URL.String(), err)
	return fmt.Sprint(err)
}
// Wayback implements the Waybacker interface for the Telegra.ph slot.
//
// The archiver is created with the shared ingress HTTP client and, when a
// remote Chrome is enabled in the configuration, pointed at its address.
// When rdx holds a bundle for the URL, its screenshots and article are
// attached to the context handed to the archiver.
//
// It returns the archived destination, or the error text after logging the
// failure.
func (i PH) Wayback(rdx reduxer.Reduxer) string {
	publisher := ph.New(ingress.Client())
	uri := i.URL.String()
	ctx := i.ctx

	if i.cfg.EnabledChromeRemote() {
		publisher.ByRemote(i.cfg.ChromeRemoteAddr())
	}

	if bundle, found := rdx.Load(reduxer.Src(uri)); found {
		withShot := publisher.WithShot(ctx, bundle.Shots())
		ctx = publisher.WithArticle(withShot, bundle.Article())
	}

	archived, err := publisher.Wayback(ctx, i.URL)
	if err == nil {
		return archived
	}

	logger.Error("wayback %s to telegra.ph failed: %v", i.URL.String(), err)
	return fmt.Sprint(err)
}
// wayback invokes w.Wayback with r and returns its result unchanged.
// It lets callers dispatch to any slot through the Waybacker interface.
func wayback(w Waybacker, r reduxer.Reduxer) string {
return w.Wayback(r)
}
// Wayback archives each of the given URLs to every enabled slot and returns
// the collected results.
//
// If ctx carries no deadline, one is applied from cfg.WaybackTimeout(). The
// context handed to the slots is further limited to duration(ctx), which
// reserves part of the time budget for the caller. Every enabled (URL, slot)
// pair runs in its own goroutine and contributes exactly one Collect whose
// Dst is the string returned by the slot, which is the error text when
// archiving failed. Slots that are disabled in cfg.Slots() are skipped with
// a warning.
//
// Results are appended as goroutines finish, so their order is not fixed.
// An error is returned only when no result was collected at all.
func Wayback(ctx context.Context, rdx reduxer.Reduxer, cfg *config.Options, urls ...*url.URL) ([]Collect, error) {
	logger.Debug("start...")

	if _, bounded := ctx.Deadline(); !bounded {
		var stop context.CancelFunc
		ctx, stop = context.WithTimeout(ctx, cfg.WaybackTimeout())
		defer stop()
	}
	ctx, cancel := context.WithTimeout(ctx, duration(ctx))
	defer cancel()

	var (
		mu      sync.Mutex // guards results
		results = []Collect{}
	)
	g, ctx := errgroup.WithContext(ctx)

	for _, input := range urls {
		for slot, enabled := range cfg.Slots() {
			if !enabled {
				logger.Warn("skipped %s", config.SlotName(slot))
				continue
			}

			// Per-iteration copies, so each goroutine sees its own pair.
			name, target := slot, input
			g.Go(func() error {
				logger.Debug("archiving slot: %s", name)

				col := Collect{Arc: name, Src: target.String(), Ext: name}
				switch name {
				case config.SLOT_IA:
					col.Dst = wayback(IA{ctx: ctx, cfg: cfg, URL: target}, rdx)
				case config.SLOT_IS:
					col.Dst = wayback(IS{ctx: ctx, cfg: cfg, URL: target}, rdx)
				case config.SLOT_IP:
					col.Dst = wayback(IP{ctx: ctx, cfg: cfg, URL: target}, rdx)
				case config.SLOT_PH:
					col.Dst = wayback(PH{ctx: ctx, cfg: cfg, URL: target}, rdx)
				case config.SLOT_GA:
					col.Dst = wayback(GA{ctx: ctx, cfg: cfg, URL: target}, rdx)
				}

				mu.Lock()
				defer mu.Unlock()
				results = append(results, col)
				return nil
			})
		}
	}

	if waitErr := g.Wait(); waitErr != nil {
		logger.Error("archiving some slot unexpected: %v", waitErr)
	}

	if len(results) > 0 {
		return results, nil
	}
	return results, errors.New("archiving failed: no cols")
}
// Playback looks up each of the given URLs in every known slot and returns
// the collected results.
//
// If ctx carries no deadline, one is applied from cfg.WaybackTimeout(). The
// context handed to the lookups is further limited to duration(ctx), which
// reserves part of the time budget for the caller. Every (URL, slot) pair
// runs in its own goroutine and contributes exactly one Collect whose Dst is
// the string returned by playback.Playback for that slot.
//
// Results are appended as goroutines finish, so their order is not fixed.
// An error is returned only when no result was collected at all.
func Playback(ctx context.Context, cfg *config.Options, urls ...*url.URL) (cols []Collect, err error) {
	logger.Debug("start...")

	if _, bounded := ctx.Deadline(); !bounded {
		var stop context.CancelFunc
		ctx, stop = context.WithTimeout(ctx, cfg.WaybackTimeout())
		defer stop()
	}
	ctx, cancel := context.WithTimeout(ctx, duration(ctx))
	defer cancel()

	var mu sync.Mutex // guards cols
	g, ctx := errgroup.WithContext(ctx)

	// Every slot is searched, regardless of which ones are enabled for archiving.
	slots := []string{
		config.SLOT_IA,
		config.SLOT_IS,
		config.SLOT_IP,
		config.SLOT_PH,
		config.SLOT_GA,
		config.SLOT_TT,
		config.SLOT_GC,
	}

	for _, input := range urls {
		for _, slot := range slots {
			// Per-iteration copies, so each goroutine sees its own pair.
			name, target := slot, input
			g.Go(func() error {
				logger.Debug("searching slot: %s", name)

				var dst string
				switch name {
				case config.SLOT_IA:
					dst = playback.Playback(ctx, playback.IA{URL: target})
				case config.SLOT_IS:
					dst = playback.Playback(ctx, playback.IS{URL: target})
				case config.SLOT_IP:
					dst = playback.Playback(ctx, playback.IP{URL: target})
				case config.SLOT_PH:
					dst = playback.Playback(ctx, playback.PH{URL: target})
				case config.SLOT_GA:
					dst = playback.Playback(ctx, playback.GA{URL: target})
				case config.SLOT_TT:
					dst = playback.Playback(ctx, playback.TT{URL: target})
				case config.SLOT_GC:
					dst = playback.Playback(ctx, playback.GC{URL: target})
				}
				col := Collect{Arc: name, Dst: dst, Src: target.String(), Ext: name}

				mu.Lock()
				defer mu.Unlock()
				cols = append(cols, col)
				return nil
			})
		}
	}

	if waitErr := g.Wait(); waitErr != nil {
		logger.Error("playback some slot unexpected: %v", waitErr)
	}

	if len(cols) > 0 {
		return cols, nil
	}
	return cols, errors.New("playback failed: no cols")
}
// duration returns 90% of the time remaining until ctx's deadline, so that
// downstream work is given a shorter budget and the remaining 10% is
// reserved for the caller.
//
// The remaining time is measured with full time.Duration precision. Working
// in whole seconds would truncate any budget below two seconds to zero and
// expire the derived context immediately.
//
// If ctx has no deadline there is no budget to scale and 0 is returned;
// callers are expected to set a deadline first. If the deadline has already
// passed the result is negative, which context.WithTimeout treats as already
// expired.
func duration(ctx context.Context) time.Duration {
	deadline, ok := ctx.Deadline()
	if !ok {
		return 0
	}
	remaining := time.Until(deadline)
	// Subtracting a tenth instead of multiplying by 9/10 cannot overflow.
	return remaining - remaining/10
}