From e5ece87764bc9bd449e8fbf039ff08b091b7b93a Mon Sep 17 00:00:00 2001 From: Szymon Mikitiuk Date: Thu, 1 Feb 2024 20:21:59 +0100 Subject: [PATCH 1/4] add a complete solution for link exercise using the Tokenizer --- link/main.go | 97 +++++++++++++++++++++++++++++++++++++---------- link/main_test.go | 22 ----------- 2 files changed, 76 insertions(+), 43 deletions(-) diff --git a/link/main.go b/link/main.go index 00e3ef7..f65c033 100644 --- a/link/main.go +++ b/link/main.go @@ -2,8 +2,12 @@ package main import ( "bytes" + "flag" "fmt" + "io" + "log" "os" + "strings" "golang.org/x/net/html" ) @@ -13,40 +17,91 @@ type Link struct { Text string } -func catch[T any](val T, err error) T { - if err != nil { - fmt.Println(err) - os.Exit(1) - } - return val +func (l Link) String() string { + return fmt.Sprintf("{href='%s', text='%s'}", l.Href, l.Text) } func main() { + inputFile := readFile(*parseUserInput()) + defer inputFile.Close() + links := parseLinks(*inputFile) + log.Println(links) +} + +func parseUserInput() *string { + htmlFilePath := flag.String("file", "", "Path to the HTML file") + flag.Parse() + + if *htmlFilePath == "" { + flag.Usage() + log.Fatalln("Error: HTML file path is required.") + } + + return htmlFilePath +} + +func readFile(path string) *os.File { + file, err := os.Open(path) + if err != nil { + log.Fatalf("Error reading HTML file: '%s': %v", path, err) + } - content := catch(os.ReadFile("ex3.html")) + return file +} - reader := bytes.NewReader(content) - tokenizer := html.NewTokenizer(reader) +func parseLinks(file os.File) []Link { + tokenizer := html.NewTokenizer(&file) + var links []Link + var buffer bytes.Buffer + var catchText bool + var link Link - // var links []string for { - t := tokenizer.Next() - if t == html.ErrorToken { - //fmt.Println("Error token") - // fmt.Println(t) + tokenType := tokenizer.Next() + err := processErrorToken(tokenizer, tokenType) + if err != nil { break } - // fmt.Println(z.Token().Attr) - token := tokenizer.Token() - if token.Data == "a" && len(token.Attr) > 0 { - for _, attr := range token.Attr { - if attr.Key == "href" { - fmt.Println(attr.Val) + switch tokenType { + case html.StartTagToken: + token := tokenizer.Token() + if token.DataAtom.String() == "a" && len(token.Attr) > 0 { + for _, attr := range token.Attr { + if attr.Key == "href" { + link.Href = attr.Val + catchText = true + } } } - } + case html.TextToken: + if catchText { + buffer.Write(tokenizer.Raw()) + } + + case html.EndTagToken: + token := tokenizer.Token() + if token.DataAtom.String() == "a" { + link.Text = strings.TrimSpace(buffer.String()) + links = append(links, link) + buffer.Reset() + catchText = false + } + } } + return links } + +func processErrorToken(tokenizer *html.Tokenizer, tokenType html.TokenType) error { + if tokenType == html.ErrorToken { + err := tokenizer.Err() + if err != io.EOF { + log.Fatalln("Error when parsing HTML", err) + } + return err + } + return nil +} + diff --git a/link/main_test.go b/link/main_test.go index 25079d7..70d5ab8 100644 --- a/link/main_test.go +++ b/link/main_test.go @@ -1,7 +1,6 @@ package main import ( - "reflect" "testing" ) @@ -17,24 +16,3 @@ func Test_main(t *testing.T) { }) } } - -func Test_catch(t *testing.T) { - type args struct { - val []byte - err error - } - tests := []struct { - name string - args args - want []byte - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := catch(tt.args.val, tt.args.err); !reflect.DeepEqual(got, tt.want) { - t.Errorf("catch() = %v, want %v", got, tt.want) - } - }) - } -} From 707a4bb85502d11735add002b628ca66b0fe6157 Mon Sep 17 00:00:00 2001 From: Szymon Mikitiuk Date: Thu, 1 Feb 2024 20:37:45 +0100 Subject: [PATCH 2/4] Add go.work.sum file Some explanation courtesy of ChatGPT: https://chat.openai.com/share/ac97becd-7552-4b8f-ba3b-80fdea5a6849 --- go.work.sum | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 go.work.sum diff --git a/go.work.sum b/go.work.sum new file mode 100644 index 0000000..c3d903c --- /dev/null +++ b/go.work.sum @@ -0,0 +1,2 @@ +golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= From fbb91d654acf46df0fb6951f0c52d1eb28f971e8 Mon Sep 17 00:00:00 2001 From: Py Explorer Date: Thu, 1 Feb 2024 20:58:05 +0100 Subject: [PATCH 3/4] session-10: from switch case to else if :D --- link/main.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/link/main.go b/link/main.go index f65c033..7d57ae7 100644 --- a/link/main.go +++ b/link/main.go @@ -63,8 +63,7 @@ func parseLinks(file os.File) []Link { break } - switch tokenType { - case html.StartTagToken: + if tokenType == html.StartTagToken { token := tokenizer.Token() if token.DataAtom.String() == "a" && len(token.Attr) > 0 { for _, attr := range token.Attr { @@ -74,13 +73,11 @@ func parseLinks(file os.File) []Link { } } } - - case html.TextToken: + } else if tokenType == html.TextToken { if catchText { buffer.Write(tokenizer.Raw()) } - - case html.EndTagToken: + } else if tokenType == html.EndTagToken { token := tokenizer.Token() if token.DataAtom.String() == "a" { link.Text = strings.TrimSpace(buffer.String()) @@ -104,4 +101,3 @@ func processErrorToken(tokenizer *html.Tokenizer, tokenType html.TokenType) erro } return nil } - From 70f9d43e2eb0b124981dacce0d3fffde40abfabd Mon Sep 17 00:00:00 2001 From: Py Explorer Date: Thu, 8 Feb 2024 19:31:16 +0100 Subject: [PATCH 4/4] session-10: add comment --- link/main.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/link/main.go b/link/main.go index 7d57ae7..f2b96d4 100644 --- a/link/main.go +++ b/link/main.go @@ -29,7 +29,7 @@ func main() { } func parseUserInput() *string { - htmlFilePath := flag.String("file", "", "Path to the HTML file") + htmlFilePath := flag.String("file", "ex2.html", "Path to the HTML file") flag.Parse() if *htmlFilePath == "" { @@ -53,6 +53,8 @@ func parseLinks(file os.File) []Link { tokenizer := html.NewTokenizer(&file) var links []Link var buffer bytes.Buffer + // we can use string here as well + // var text string var catchText bool var link Link @@ -76,6 +78,7 @@ func parseLinks(file os.File) []Link { } else if tokenType == html.TextToken { if catchText { buffer.Write(tokenizer.Raw()) + // text += string(tokenizer.Raw()) } } else if tokenType == html.EndTagToken { token := tokenizer.Token() @@ -83,6 +86,7 @@ func parseLinks(file os.File) []Link { link.Text = strings.TrimSpace(buffer.String()) links = append(links, link) buffer.Reset() + // text = "" catchText = false } }