refactor: dedupe html fetch
This commit is contained in:
@@ -278,36 +278,16 @@ func addCommonHeaders(request *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func fetchDocument(ctx context.Context, httpClient *http.Client, url string) (*goquery.Document, string, error) {
|
func fetchDocument(ctx context.Context, httpClient *http.Client, url string) (*goquery.Document, string, error) {
|
||||||
client := httpClient
|
document, response, err := netutil.FetchHTMLDocument(ctx, httpClient, url, addCommonHeaders, func(response *http.Response, body []byte) error {
|
||||||
if client == nil {
|
return &HTTPStatusError{
|
||||||
client = http.DefaultClient
|
|
||||||
}
|
|
||||||
|
|
||||||
request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
|
||||||
if err != nil {
|
|
||||||
return nil, url, fmt.Errorf("failed to create request: %w", err)
|
|
||||||
}
|
|
||||||
addCommonHeaders(request)
|
|
||||||
|
|
||||||
response, err := client.Do(request)
|
|
||||||
if err != nil {
|
|
||||||
return nil, url, fmt.Errorf("request failed: %w", err)
|
|
||||||
}
|
|
||||||
defer func() { _ = response.Body.Close() }()
|
|
||||||
|
|
||||||
if response.StatusCode != http.StatusOK {
|
|
||||||
body, _ := io.ReadAll(io.LimitReader(response.Body, netutil.Bytes512))
|
|
||||||
return nil, url, &HTTPStatusError{
|
|
||||||
StatusCode: response.StatusCode,
|
StatusCode: response.StatusCode,
|
||||||
URL: url,
|
URL: url,
|
||||||
ContentType: strings.TrimSpace(response.Header.Get("Content-Type")),
|
ContentType: strings.TrimSpace(response.Header.Get("Content-Type")),
|
||||||
BodyPreview: strings.Join(strings.Fields(strings.TrimSpace(string(body))), " "),
|
BodyPreview: strings.Join(strings.Fields(strings.TrimSpace(string(body))), " "),
|
||||||
}
|
}
|
||||||
}
|
})
|
||||||
|
|
||||||
document, err := goquery.NewDocumentFromReader(response.Body)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, url, fmt.Errorf("failed to parse html: %w", err)
|
return nil, url, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return document, response.Request.URL.String(), nil
|
return document, response.Request.URL.String(), nil
|
||||||
|
|||||||
@@ -86,28 +86,8 @@ func addCommonHeaders(request *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func fetchDocument(ctx context.Context, httpClient *http.Client, url string) (*goquery.Document, error) {
|
func fetchDocument(ctx context.Context, httpClient *http.Client, url string) (*goquery.Document, error) {
|
||||||
client := httpClient
|
document, _, err := netutil.FetchHTMLDocument(ctx, httpClient, url, addCommonHeaders, func(response *http.Response, body []byte) error {
|
||||||
if client == nil {
|
return &HTTPStatusError{
|
||||||
client = http.DefaultClient
|
|
||||||
}
|
|
||||||
|
|
||||||
request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
addCommonHeaders(request)
|
|
||||||
|
|
||||||
response, err := client.Do(request)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("request failed: %w", err)
|
|
||||||
}
|
|
||||||
defer func() { _ = response.Body.Close() }()
|
|
||||||
|
|
||||||
if response.StatusCode != http.StatusOK {
|
|
||||||
// limit body read for error context; avoid reading large error pages
|
|
||||||
body, _ := io.ReadAll(io.LimitReader(response.Body, netutil.Bytes512))
|
|
||||||
return nil, &HTTPStatusError{
|
|
||||||
StatusCode: response.StatusCode,
|
StatusCode: response.StatusCode,
|
||||||
URL: url,
|
URL: url,
|
||||||
Server: strings.TrimSpace(response.Header.Get("Server")),
|
Server: strings.TrimSpace(response.Header.Get("Server")),
|
||||||
@@ -116,14 +96,8 @@ func fetchDocument(ctx context.Context, httpClient *http.Client, url string) (*g
|
|||||||
ContentType: strings.TrimSpace(response.Header.Get("Content-Type")),
|
ContentType: strings.TrimSpace(response.Header.Get("Content-Type")),
|
||||||
BodyPreview: strings.Join(strings.Fields(strings.TrimSpace(string(body))), " "),
|
BodyPreview: strings.Join(strings.Fields(strings.TrimSpace(string(body))), " "),
|
||||||
}
|
}
|
||||||
}
|
})
|
||||||
|
return document, err
|
||||||
document, err := goquery.NewDocumentFromReader(response.Body)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to parse html: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return document, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractTypeLabelsByID(doc *goquery.Document) map[int]string {
|
func extractTypeLabelsByID(doc *goquery.Document) map[int]string {
|
||||||
|
|||||||
49
pkg/net/document.go
Normal file
49
pkg/net/document.go
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
package netutil
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
|
||||||
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
)
|
||||||
|
|
||||||
|
func FetchHTMLDocument(
|
||||||
|
ctx context.Context,
|
||||||
|
httpClient *http.Client,
|
||||||
|
url string,
|
||||||
|
prepareRequest func(*http.Request),
|
||||||
|
buildStatusError func(*http.Response, []byte) error,
|
||||||
|
) (*goquery.Document, *http.Response, error) {
|
||||||
|
client := httpClient
|
||||||
|
if client == nil {
|
||||||
|
client = http.DefaultClient
|
||||||
|
}
|
||||||
|
|
||||||
|
request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("failed to create request: %w", err)
|
||||||
|
}
|
||||||
|
if prepareRequest != nil {
|
||||||
|
prepareRequest(request)
|
||||||
|
}
|
||||||
|
|
||||||
|
response, err := client.Do(request)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("request failed: %w", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = response.Body.Close() }()
|
||||||
|
|
||||||
|
if response.StatusCode != http.StatusOK {
|
||||||
|
body, _ := io.ReadAll(io.LimitReader(response.Body, Bytes512))
|
||||||
|
return nil, response, buildStatusError(response, body)
|
||||||
|
}
|
||||||
|
|
||||||
|
document, err := goquery.NewDocumentFromReader(response.Body)
|
||||||
|
if err != nil {
|
||||||
|
return nil, response, fmt.Errorf("failed to parse html: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return document, response, nil
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user