-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconst.go
More file actions
184 lines (160 loc) · 4.85 KB
/
const.go
File metadata and controls
184 lines (160 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
package xparse
// Core extraction configuration keys.
const (
	// Index selects which of the located elements are extracted.
	//
	// Key spellings: "_index" or "_i".
	//
	// Accepted values:
	//   - nil / key absent: every element is returned
	//   - array: [0,1] returns elements[0] and elements[1]
	//   - single: 0 returns elements[0]
	//
	// Index forms:
	//  1. no index key at all
	//  2. index: ~ (index is null)
	//  3. index: 0
	//  4. index: [0, 1, ...]
	//  5. index: 0,4 => 0,1,2,3
	Index = "_index"

	// Locator holds the path/selector used to find the desired elements.
	//
	// Key spellings: "_locator" or "_l".
	//
	// Supported value shapes:
	//
	// string:
	//
	//	_locator: string
	//
	// list:
	//
	//	_locator:
	//	  - div.001
	//	  - div.002
	//	  - div.003
	//
	// map:
	//
	//	_locator:
	//	  key1: div.001
	//	  key2: div.002
	//	  key3: div.003
	Locator = "_locator"

	// Raw is the "_raw" configuration key. When a configuration map
	// contains it:
	//   - the value is returned as-is, without any processing
	//   - any "_locator" setting is ignored
	Raw = "_raw"
)

// Element navigation keys.
const (
	// ExtractPrevElem is the fallback for the rare case where no proper
	// locator exists. Normally a locator is enough to reach the wanted
	// element; when it is not, this key is used to get the previous element.
	ExtractPrevElem = "_extract_prev"

	// ExtractParent is the "_extract_parent" key.
	// NOTE(review): presumably the parent-element counterpart of
	// ExtractPrevElem — confirm against the code that reads it.
	ExtractParent = "_extract_parent"
)
// Attribute-related configuration keys.
const (
	// Attr names the attribute to extract. When it is not given, the
	// element text is used. The special value "__html" yields the raw HTML.
	Attr = "_attr"

	// AttrRefine describes how the extracted attribute is refined.
	//
	// Key spellings: "_attr_refine" or "_ar".
	//
	// Accepted values:
	//   - bool(true): the method name is generated automatically
	//   - string starting with "_" (e.g. "_name"): the "refine" prefix is
	//     added, so "_name" becomes "_refine_name"
	//   - string of the form refine_xxx / _refine_xxx: used as-is
	//   - string not starting with "_": used as-is
	AttrRefine = "_attr_refine"

	// AttrJoiner specifies the joiner for attributes.
	AttrJoiner = "_joiner"

	// AttrIndex is the "_attr_index" key. Configuration:
	//   - _joiner: ","
	//   - _attr_refine: _attr_by_index
	//   - _attr_index: 0
	AttrIndex = "_attr_index"

	// AttrRegex is the "_attr_regex" key.
	// NOTE(review): presumably a regular expression applied to the
	// extracted attribute — confirm against the code that reads it.
	AttrRegex = "_attr_regex"

	// AttrPython runs a Python script directly; a Python environment is
	// required. Example:
	//
	//	import sys
	//	raw = sys.argv[1]     # raw is globally registered
	//	arr = raw.split("_")
	//	print(arr[1])         # required: the printed value becomes the refined attr value
	AttrPython = "_attr_python"

	// AttrJS runs JavaScript code. Example:
	//
	//	arr = raw.split("_")  // raw is registered by default
	//	refined = arr[1]      // refined is the required result value
	//
	// Underscore.js (https://underscorejs.org/) is supported by default.
	AttrJS = "_attr_js"
)
// Post-processing configuration keys.
const (
	// PostJoin joins the array of parsed attributes into a single string
	// using the joiner.
	PostJoin = "_post_join"

	// OmitEmpty is the "_omit_empty" key; it omits empty values.
	OmitEmpty = "_omit_empty"

	// Strip controls how the string is trimmed.
	//
	// Accepted values:
	//   - `_strip: true`, or key absent: strings.TrimSpace is applied
	//   - `_strip: str`: strings.ReplaceAll(raw, str, "") is applied
	//   - `_strip: ["(", ")"]`: each entry is replaced one by one
	//
	// Stripping happens by default; set `_strip: false` to disable it.
	Strip = "_strip"

	// Type converts the output to the given type. Without
	// `_type: b/i/f` the value is returned as a string.
	//
	// Accepted values:
	//   - b: bool
	//   - i: int
	//   - f: float
	Type = "_type"
)
// Abbreviated spellings of configuration keys.
const (
	// IndexAbbr is the short spelling of the "_index" key.
	IndexAbbr = "_i"

	// LocatorAbbr is the short spelling of the "_locator" key.
	LocatorAbbr = "_l"

	// AttrRefineAbbr is the short spelling of the "_attr_refine" key.
	AttrRefineAbbr = "_ar"

	// TypeAbbr is the "_t" key.
	// NOTE(review): presumably the short spelling of the "_type" key,
	// by analogy with the other abbreviations — confirm.
	TypeAbbr = "_t"
)
// Special locators and internal constants.
const (
	// JSONArrayRootLocator addresses a JSON array that has no root object,
	// i.e. a JSON file holding an ordered list of values such as
	// `[{...}, {...}]`.
	JSONArrayRootLocator = "*/*"

	// PrefixLocatorStub marks a locator that does not live in the same stub
	// as its siblings; such a locator is recalculated from the base locator
	// (the map root). Example:
	//
	//	jobs:
	//	  _locator: jobs
	//	  _index:
	//	  taxo:
	//	    _locator: taxonomyAttributes
	//	    _index: 0
	//	    attr:
	//	      _locator:
	//	        - attributes
	//	        - ___.salarySnippet
	//
	// NOTE(review): the nesting of attr under taxo is inferred — confirm.
	PrefixLocatorStub = "___"

	// _prefixRefine is the word used as the prefix of an attr refiner's
	// method name.
	_prefixRefine = "_refine"

	// AttrJoinerSep is the separator used when joining an array into a
	// string.
	AttrJoinerSep = "|||"
)
// Special attribute values.
const (
	// AttrJoinElemsText joins the inner text of all elements into one
	// string. It applies only when parsing HTML.
	//
	// Warning: rarely used; consider alternatives first.
	AttrJoinElemsText = "__join_text"

	// AttrRawHTML returns the raw HTML of the locator.
	AttrRawHTML = "__html"

	// RefineWithKeyName uses the key name as the refiner method. Example:
	//
	//	root:
	//	  a_changeable_name:
	//	    _locator: div.xxx
	//	    _attr: title
	//	    _attr_refine: __key
	RefineWithKeyName = "__key"
)
// Type constants.
const (
	// AttrTypeB denotes a boolean.
	AttrTypeB = "b"

	// AttrTypeF denotes a float.
	AttrTypeF = "f"

	// AttrTypeI denotes an integer.
	AttrTypeI = "i"
)

// Time type constants.
const (
	// AttrTypeT denotes a time value, quick mode.
	AttrTypeT = "t"

	// AttrTypeT1 denotes a time value, search mode.
	AttrTypeT1 = "t1"
)