diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0518976d..af66fe37 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -36,7 +36,8 @@ jobs: - name: Build dlc.dat and plain lists run: | cd code || exit 1 - go run ./ --outputdir=../ --exportlists=_all_,category-ads-all,tld-cn,cn,tld-\!cn,geolocation-\!cn,apple,icloud + go run ./ --outputdir=../ --exportlists=category-ads-all,tld-cn,cn,tld-\!cn,geolocation-\!cn,apple,icloud + go run ./cmd/datdump/main.go --inputdata=../dlc.dat --outputdir=../ --exportlists=_all_ cd ../ && rm -rf code - name: Generate dlc.dat sha256 hash diff --git a/.gitignore b/.gitignore index e6876bd7..b8beb384 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,5 @@ dlc.dat # Exported plaintext lists. -dlc.dat_plain.yml +/*.yml /*.txt diff --git a/cmd/datdump/main.go b/cmd/datdump/main.go new file mode 100644 index 00000000..fe24d652 --- /dev/null +++ b/cmd/datdump/main.go @@ -0,0 +1,164 @@ +package main + +import ( + "bufio" + "flag" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/v2fly/domain-list-community/internal/dlc" + router "github.com/v2fly/v2ray-core/v5/app/router/routercommon" + "google.golang.org/protobuf/proto" +) + +var ( + inputData = flag.String("inputdata", "dlc.dat", "Name of the geosite dat file") + outputDir = flag.String("outputdir", "./", "Directory to place all generated files") + exportLists = flag.String("exportlists", "", "Lists to be exported, separated by ',' (empty for _all_)") +) + +type DomainRule struct { + Type string + Value string + Attrs []string +} + +type DomainList struct { + Name string + Rules []DomainRule +} + +func (d *DomainRule) domain2String() string { + dstring := d.Type + ":" + d.Value + if len(d.Attrs) != 0 { + dstring += ":@" + strings.Join(d.Attrs, ",@") + } + return dstring +} + +func loadGeosite(path string) ([]DomainList, map[string]*DomainList, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, nil, fmt.Errorf("failed to read geosite file: %w", err) + } + vgeositeList := new(router.GeoSiteList) + if err := proto.Unmarshal(data, vgeositeList); err != nil { + return nil, nil, fmt.Errorf("failed to unmarshal: %w", err) + } + domainLists := make([]DomainList, len(vgeositeList.Entry)) + domainListByName := make(map[string]*DomainList, len(vgeositeList.Entry)) + for i, vsite := range vgeositeList.Entry { + rules := make([]DomainRule, 0, len(vsite.Domain)) + for _, vdomain := range vsite.Domain { + rule := DomainRule{Value: vdomain.Value} + switch vdomain.Type { + case router.Domain_RootDomain: + rule.Type = dlc.RuleTypeDomain + case router.Domain_Regex: + rule.Type = dlc.RuleTypeRegexp + case router.Domain_Plain: + rule.Type = dlc.RuleTypeKeyword + case router.Domain_Full: + rule.Type = dlc.RuleTypeFullDomain + default: + return nil, nil, fmt.Errorf("invalid rule type: %+v", vdomain.Type) + } + for _, vattr := range vdomain.Attribute { + rule.Attrs = append(rule.Attrs, vattr.Key) + } + rules = append(rules, rule) + } + domainLists[i] = DomainList{ + Name: strings.ToUpper(vsite.CountryCode), + Rules: rules, + } + domainListByName[domainLists[i].Name] = &domainLists[i] + } + return domainLists, domainListByName, nil +} + +func exportSite(name string, domainListByName map[string]*DomainList) error { + domainList, ok := domainListByName[strings.ToUpper(name)] + if !ok { + return fmt.Errorf("list '%s' does not exist", name) + } + if len(domainList.Rules) == 0 { + return fmt.Errorf("list '%s' is empty", name) + } + file, err := os.Create(filepath.Join(*outputDir, name+".yml")) + if err != nil { + return err + } + defer file.Close() + w := bufio.NewWriter(file) + fmt.Fprintf(w, "%s:\n", name) + for _, domain := range domainList.Rules { + fmt.Fprintf(w, " - %q\n", domain.domain2String()) + } + return w.Flush() +} + +func exportAll(filename string, domainLists []DomainList) error { + file, err := os.Create(filepath.Join(*outputDir, filename)) + if err != nil { + return err + } + defer file.Close() + w := bufio.NewWriter(file) + w.WriteString("lists:\n") + for _, domainList := range domainLists { + fmt.Fprintf(w, " - name: %s\n", strings.ToLower(domainList.Name)) + fmt.Fprintf(w, " length: %d\n", len(domainList.Rules)) + w.WriteString(" rules:\n") + for _, domain := range domainList.Rules { + fmt.Fprintf(w, " - %q\n", domain.domain2String()) + } + } + return w.Flush() +} + +func main() { + flag.Parse() + + // Create output directory if not exist + if _, err := os.Stat(*outputDir); os.IsNotExist(err) { + if mkErr := os.MkdirAll(*outputDir, 0755); mkErr != nil { + fmt.Println("Failed to create output directory:", mkErr) + os.Exit(1) + } + } + + fmt.Printf("Loading %s...\n", *inputData) + domainLists, domainListByName, err := loadGeosite(*inputData) + if err != nil { + fmt.Println("Failed to loadGeosite:", err) + os.Exit(1) + } + + var exportListSlice []string + for raw := range strings.SplitSeq(*exportLists, ",") { + if trimmed := strings.TrimSpace(raw); trimmed != "" { + exportListSlice = append(exportListSlice, trimmed) + } + } + if len(exportListSlice) == 0 { + exportListSlice = []string{"_all_"} + } + + for _, eplistname := range exportListSlice { + if strings.EqualFold(eplistname, "_all_") { + if err := exportAll(filepath.Base(*inputData)+"_plain.yml", domainLists); err != nil { + fmt.Println("Failed to exportAll:", err) + continue + } + } else { + if err := exportSite(eplistname, domainListByName); err != nil { + fmt.Println("Failed to exportSite:", err) + continue + } + } + fmt.Printf("list: '%s' has been exported successfully.\n", eplistname) + } +} diff --git a/internal/dlc/dlc.go b/internal/dlc/dlc.go new file mode 100644 index 00000000..542f35ff --- /dev/null +++ b/internal/dlc/dlc.go @@ -0,0 +1,9 @@ +package dlc + +const ( + RuleTypeDomain string = "domain" + RuleTypeFullDomain string = "full" + RuleTypeKeyword string = "keyword" + RuleTypeRegexp string = "regexp" + RuleTypeInclude string = "include" +) diff --git a/main.go b/main.go index 4708b0e8..46ef1fb4 100644 --- a/main.go +++ b/main.go @@ -10,6 +10,7 @@ import ( "slices" "strings" + "github.com/v2fly/domain-list-community/internal/dlc" router "github.com/v2fly/v2ray-core/v5/app/router/routercommon" "google.golang.org/protobuf/proto" ) @@ -21,21 +22,6 @@ var ( exportLists = flag.String("exportlists", "", "Lists to be flattened and exported in plaintext format, separated by ',' comma") ) -const ( - RuleTypeDomain string = "domain" - RuleTypeFullDomain string = "full" - RuleTypeKeyword string = "keyword" - RuleTypeRegexp string = "regexp" - RuleTypeInclude string = "include" -) - -var ( - TypeChecker = regexp.MustCompile(`^(domain|full|keyword|regexp|include)$`) - DomainChecker = regexp.MustCompile(`^[a-z0-9\.-]+$`) - AttrChecker = regexp.MustCompile(`^[a-z0-9!-]+$`) - SiteChecker = regexp.MustCompile(`^[A-Z0-9!-]+$`) -) - var ( refMap = make(map[string][]*Entry) plMap = make(map[string]*ParsedList) @@ -66,7 +52,7 @@ type ParsedList struct { func makeProtoList(listName string, entries []*Entry) (*router.GeoSite, error) { site := &router.GeoSite{ CountryCode: listName, - Domain: make([]*router.Domain, 0, len(entries)), + Domain: make([]*router.Domain, 0, len(entries)), } for _, entry := range entries { pdomain := &router.Domain{Value: entry.Value} @@ -78,13 +64,13 @@ func makeProtoList(listName string, entries []*Entry) (*router.GeoSite, error) { } switch entry.Type { - case RuleTypeDomain: + case dlc.RuleTypeDomain: pdomain.Type = router.Domain_RootDomain - case RuleTypeRegexp: + case dlc.RuleTypeRegexp: pdomain.Type = router.Domain_Regex - case RuleTypeKeyword: + case dlc.RuleTypeKeyword: pdomain.Type = router.Domain_Plain - case RuleTypeFullDomain: + case dlc.RuleTypeFullDomain: pdomain.Type = router.Domain_Full } site.Domain = append(site.Domain, pdomain) @@ -92,31 +78,12 @@ func makeProtoList(listName string, entries []*Entry) (*router.GeoSite, error) { return site, nil } -func writePlainAll(siteList *[]string) error { - file, err := os.Create(filepath.Join(*outputDir, *outputName + "_plain.yml")) - if err != nil { - return err - } - defer file.Close() - w := bufio.NewWriter(file) - w.WriteString("lists:\n") - for _, site := range *siteList { - fmt.Fprintf(w, " - name: %s\n", strings.ToLower(site)) - fmt.Fprintf(w, " length: %d\n", len(finalMap[site])) - w.WriteString(" rules:\n") - for _, entry := range finalMap[site] { - fmt.Fprintf(w, " - %s\n", entry.Plain) - } - } - return w.Flush() -} - func writePlainList(exportedName string) error { targetList, exist := finalMap[strings.ToUpper(exportedName)] if !exist || len(targetList) == 0 { - return fmt.Errorf("'%s' list does not exist or is empty.", exportedName) + return fmt.Errorf("list %q does not exist or is empty.", exportedName) } - file, err := os.Create(filepath.Join(*outputDir, strings.ToLower(exportedName) + ".txt")) + file, err := os.Create(filepath.Join(*outputDir, strings.ToLower(exportedName)+".txt")) if err != nil { return err } @@ -131,41 +98,43 @@ func writePlainList(exportedName string) error { func parseEntry(line string) (Entry, error) { var entry Entry parts := strings.Fields(line) + if len(parts) == 0 { + return entry, fmt.Errorf("empty line: %q", line) + } // Parse type and value - rawTypeVal := parts[0] - kv := strings.Split(rawTypeVal, ":") - if len(kv) == 1 { - entry.Type = RuleTypeDomain // Default type - entry.Value = strings.ToLower(rawTypeVal) - } else if len(kv) == 2 { - entry.Type = strings.ToLower(kv[0]) - if entry.Type == RuleTypeRegexp { - entry.Value = kv[1] - } else if entry.Type == RuleTypeInclude { - entry.Value = strings.ToUpper(kv[1]) - } else { - entry.Value = strings.ToLower(kv[1]) + v := parts[0] + colonIndex := strings.Index(v, ":") + if colonIndex == -1 { + entry.Type = dlc.RuleTypeDomain // Default type + entry.Value = strings.ToLower(v) + if !validateDomainChars(entry.Value) { + return entry, fmt.Errorf("invalid domain: %q", entry.Value) } } else { - return entry, fmt.Errorf("invalid format: %s", line) - } - // Check type and value - if !TypeChecker.MatchString(entry.Type) { - return entry, fmt.Errorf("invalid type: %s", entry.Type) - } - switch entry.Type { - case RuleTypeRegexp: - if _, err := regexp.Compile(entry.Value); err != nil { - return entry, fmt.Errorf("invalid regexp: %s", entry.Value) - } - case RuleTypeInclude: - if !SiteChecker.MatchString(entry.Value) { - return entry, fmt.Errorf("invalid included list name: %s", entry.Value) - } - default: // `full`, `domain` and `keyword` are all (parts of) domains - if !DomainChecker.MatchString(entry.Value) { - return entry, fmt.Errorf("invalid domain: %s", entry.Value) + typ := strings.ToLower(v[:colonIndex]) + val := v[colonIndex+1:] + switch typ { + case dlc.RuleTypeRegexp: + if _, err := regexp.Compile(val); err != nil { + return entry, fmt.Errorf("invalid regexp %q: %w", val, err) + } + entry.Type = dlc.RuleTypeRegexp + entry.Value = val + case dlc.RuleTypeInclude: + entry.Type = dlc.RuleTypeInclude + entry.Value = strings.ToUpper(val) + if !validateSiteName(entry.Value) { + return entry, fmt.Errorf("invalid include list name: %q", entry.Value) + } + case dlc.RuleTypeDomain, dlc.RuleTypeFullDomain, dlc.RuleTypeKeyword: + entry.Type = typ + entry.Value = strings.ToLower(val) + if !validateDomainChars(entry.Value) { + return entry, fmt.Errorf("invalid domain: %q", entry.Value) + } + default: + return entry, fmt.Errorf("invalid type: %q", typ) } } @@ -173,18 +142,18 @@ func parseEntry(line string) (Entry, error) { for _, part := range parts[1:] { if strings.HasPrefix(part, "@") { attr := strings.ToLower(part[1:]) // Trim attribute prefix `@` character - if !AttrChecker.MatchString(attr) { - return entry, fmt.Errorf("invalid attribute key: %s", attr) + if !validateAttrChars(attr) { + return entry, fmt.Errorf("invalid attribute: %q", attr) } entry.Attrs = append(entry.Attrs, attr) } else if strings.HasPrefix(part, "&") { aff := strings.ToUpper(part[1:]) // Trim affiliation prefix `&` character - if !SiteChecker.MatchString(aff) { - return entry, fmt.Errorf("invalid affiliation key: %s", aff) + if !validateSiteName(aff) { + return entry, fmt.Errorf("invalid affiliation: %q", aff) } entry.Affs = append(entry.Affs, aff) } else { - return entry, fmt.Errorf("invalid attribute/affiliation: %s", part) + return entry, fmt.Errorf("invalid attribute/affiliation: %q", part) } } // Sort attributes @@ -198,6 +167,39 @@ func parseEntry(line string) (Entry, error) { return entry, nil } +func validateDomainChars(domain string) bool { + for i := range domain { + c := domain[i] + if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '.' || c == '-' { + continue + } + return false + } + return true +} + +func validateAttrChars(attr string) bool { + for i := range attr { + c := attr[i] + if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '!' || c == '-' { + continue + } + return false + } + return true +} + +func validateSiteName(name string) bool { + for i := range name { + c := name[i] + if (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '!' || c == '-' { + continue + } + return false + } + return true +} + func loadData(path string) error { file, err := os.Open(path) if err != nil { @@ -206,7 +208,7 @@ func loadData(path string) error { defer file.Close() listName := strings.ToUpper(filepath.Base(path)) - if !SiteChecker.MatchString(listName) { + if !validateSiteName(listName) { return fmt.Errorf("invalid list name: %s", listName) } scanner := bufio.NewScanner(file) @@ -238,7 +240,7 @@ func parseList(refName string, refList []*Entry) error { plMap[refName] = pl } for _, entry := range refList { - if entry.Type == RuleTypeInclude { + if entry.Type == dlc.RuleTypeInclude { if len(entry.Affs) != 0 { return fmt.Errorf("affiliation is not allowed for include:%s", entry.Value) } @@ -272,18 +274,18 @@ func polishList(roughMap *map[string]*Entry) []*Entry { domainsMap := make(map[string]bool) for _, entry := range *roughMap { switch entry.Type { // Bypass regexp, keyword and "full/domain with attr" - case RuleTypeRegexp: + case dlc.RuleTypeRegexp: finalList = append(finalList, entry) - case RuleTypeKeyword: + case dlc.RuleTypeKeyword: finalList = append(finalList, entry) - case RuleTypeDomain: + case dlc.RuleTypeDomain: domainsMap[entry.Value] = true if len(entry.Attrs) != 0 { finalList = append(finalList, entry) } else { queuingList = append(queuingList, entry) } - case RuleTypeFullDomain: + case dlc.RuleTypeFullDomain: if len(entry.Attrs) != 0 { finalList = append(finalList, entry) } else { @@ -295,14 +297,18 @@ func polishList(roughMap *map[string]*Entry) []*Entry { for _, qentry := range queuingList { isRedundant := false pd := qentry.Value // To be parent domain - if qentry.Type == RuleTypeFullDomain { + if qentry.Type == dlc.RuleTypeFullDomain { pd = "." + pd // So that `domain:example.org` overrides `full:example.org` } for { idx := strings.Index(pd, ".") - if idx == -1 { break } + if idx == -1 { + break + } pd = pd[idx+1:] // Go for next parent - if !strings.Contains(pd, ".") { break } // Not allow tld to be a parent + if !strings.Contains(pd, ".") { + break + } // Not allow tld to be a parent if domainsMap[pd] { isRedundant = true break @@ -320,7 +326,9 @@ func polishList(roughMap *map[string]*Entry) []*Entry { } func resolveList(pl *ParsedList) error { - if _, pldone := finalMap[pl.Name]; pldone { return nil } + if _, pldone := finalMap[pl.Name]; pldone { + return nil + } if cirIncMap[pl.Name] { return fmt.Errorf("circular inclusion in: %s", pl.Name) @@ -329,14 +337,22 @@ func resolveList(pl *ParsedList) error { defer delete(cirIncMap, pl.Name) isMatchAttrFilters := func(entry *Entry, incFilter *Inclusion) bool { - if len(incFilter.MustAttrs) == 0 && len(incFilter.BanAttrs) == 0 { return true } - if len(entry.Attrs) == 0 { return len(incFilter.MustAttrs) == 0 } + if len(incFilter.MustAttrs) == 0 && len(incFilter.BanAttrs) == 0 { + return true + } + if len(entry.Attrs) == 0 { + return len(incFilter.MustAttrs) == 0 + } for _, m := range incFilter.MustAttrs { - if !slices.Contains(entry.Attrs, m) { return false } + if !slices.Contains(entry.Attrs, m) { + return false + } } for _, b := range incFilter.BanAttrs { - if slices.Contains(entry.Attrs, b) { return false } + if slices.Contains(entry.Attrs, b) { + return false + } } return true } @@ -348,7 +364,7 @@ func resolveList(pl *ParsedList) error { for _, inc := range pl.Inclusions { incPl, exist := plMap[inc.Source] if !exist { - return fmt.Errorf("list '%s' includes a non-existent list: '%s'", pl.Name, inc.Source) + return fmt.Errorf("list %q includes a non-existent list: %q", pl.Name, inc.Source) } if err := resolveList(incPl); err != nil { return err @@ -395,54 +411,51 @@ func main() { } } - // Generate finalMap and sorted list of site names - siteList := make([]string, 0 ,len(plMap)) + // Generate finalMap for _, pl := range plMap { - siteList = append(siteList, pl.Name) if err := resolveList(pl); err != nil { fmt.Println("Failed to resolveList:", err) os.Exit(1) } } - slices.Sort(siteList) // Create output directory if not exist if _, err := os.Stat(*outputDir); os.IsNotExist(err) { if mkErr := os.MkdirAll(*outputDir, 0755); mkErr != nil { - fmt.Println("Failed:", mkErr) + fmt.Println("Failed to create output directory:", mkErr) os.Exit(1) } } // Export plaintext list - if *exportLists != "" { - exportedListSlice := strings.Split(*exportLists, ",") - for _, exportedList := range exportedListSlice { - if exportedList == "_all_" { - if err := writePlainAll(&siteList); err != nil { - fmt.Println("Failed to writePlainAll:", err) - continue - } - } else { - if err := writePlainList(exportedList); err != nil { - fmt.Println("Failed to write list:", err) - continue - } - } - fmt.Printf("list: '%s' has been generated successfully.\n", exportedList) + var exportListSlice []string + for raw := range strings.SplitSeq(*exportLists, ",") { + if trimmed := strings.TrimSpace(raw); trimmed != "" { + exportListSlice = append(exportListSlice, trimmed) } } + for _, exportList := range exportListSlice { + if err := writePlainList(exportList); err != nil { + fmt.Println("Failed to write list:", err) + continue + } + fmt.Printf("list %q has been generated successfully.\n", exportList) + } // Generate dat file protoList := new(router.GeoSiteList) - for _, siteName := range siteList { // So that protoList.Entry is sorted - site, err := makeProtoList(siteName, finalMap[siteName]) + for siteName, siteEntries := range finalMap { + site, err := makeProtoList(siteName, siteEntries) if err != nil { fmt.Println("Failed to makeProtoList:", err) os.Exit(1) } protoList.Entry = append(protoList.Entry, site) } + // Sort protoList so the marshaled list is reproducible + slices.SortFunc(protoList.Entry, func(a, b *router.GeoSite) int { + return strings.Compare(a.CountryCode, b.CountryCode) + }) protoBytes, err := proto.Marshal(protoList) if err != nil {