Db
6 years ago
Handler
6 years ago
TableLogAction
6 years ago
Visit
6 years ago
Action.php
6 years ago
ActionPageview.php
6 years ago
Cache.php
6 years ago
Db.php
6 years ago
Failures.php
6 years ago
FingerprintSalt.php
6 years ago
GoalManager.php
6 years ago
Handler.php
6 years ago
IgnoreCookie.php
6 years ago
LogTable.php
6 years ago
Model.php
6 years ago
PageUrl.php
6 years ago
Request.php
5 years ago
RequestProcessor.php
6 years ago
RequestSet.php
6 years ago
Response.php
6 years ago
ScheduledTasksRunner.php
6 years ago
Settings.php
5 years ago
TableLogAction.php
6 years ago
TrackerCodeGenerator.php
6 years ago
TrackerConfig.php
6 years ago
Visit.php
5 years ago
VisitExcluded.php
6 years ago
VisitInterface.php
6 years ago
Visitor.php
6 years ago
VisitorNotFoundInDb.php
6 years ago
VisitorRecognizer.php
6 years ago
PageUrl.php
385 lines
| 1 | <?php |
| 2 | /** |
| 3 | * Piwik - free/libre analytics platform |
| 4 | * |
| 5 | * @link https://matomo.org |
| 6 | * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later |
| 7 | * |
| 8 | */ |
| 9 | |
| 10 | namespace Piwik\Tracker; |
| 11 | |
| 12 | use Piwik\Common; |
| 13 | use Piwik\Config; |
| 14 | use Piwik\Piwik; |
| 15 | use Piwik\UrlHelper; |
| 16 | |
class PageUrl
{
    /**
     * Map URL prefixes to integers.
     *
     * The order of the entries matters: normalizeUrl() returns the first prefix that
     * matches, so each "www." variant has to be listed before its shorter counterpart.
     *
     * @see self::normalizeUrl(), self::reconstructNormalizedUrl()
     */
    public static $urlPrefixMap = array(
        'http://www.'  => 1,
        'http://'      => 0,
        'https://www.' => 3,
        'https://'     => 2
    );

    /**
     * Given the Input URL, will exclude all query parameters set for this site.
     *
     * When the URL has a query string, the parameters are removed from the query string.
     * When it has no query string but has a fragment, the fragment is treated as a query
     * string and filtered the same way.
     *
     * @static
     * @param string $originalUrl
     * @param int $idSite
     * @param array $additionalParametersToExclude Extra parameter names to remove, on top of
     *                                             the ones returned by getQueryParametersToExclude().
     * @return bool|string Returned URL is HTML entities decoded
     */
    public static function excludeQueryParametersFromUrl($originalUrl, $idSite, $additionalParametersToExclude = [])
    {
        $originalUrl = self::cleanupUrl($originalUrl);

        $parsedUrl = @parse_url($originalUrl);
        $parsedUrl = self::cleanupHostAndHashTag($parsedUrl, $idSite);

        // Always resolved, even if nothing ends up being filtered: this call posts the
        // Tracker.PageUrl.getQueryParametersToExclude event.
        $parametersToExclude = array_merge(self::getQueryParametersToExclude($idSite), $additionalParametersToExclude);

        if (!empty($parsedUrl['query'])) {
            $component = 'query';
        } elseif (!empty($parsedUrl['fragment'])) {
            // Exclude from the hash tag as well
            $component = 'fragment';
        } else {
            return UrlHelper::getParseUrlReverse($parsedUrl);
        }

        $queryParameters = UrlHelper::getArrayFromQueryString($parsedUrl[$component]);
        $parsedUrl[$component] = UrlHelper::getQueryStringWithExcludedParameters($queryParameters, $parametersToExclude);

        return UrlHelper::getParseUrlReverse($parsedUrl);
    }

    /**
     * Returns the array of parameters names that must be excluded from the Query String in all tracked URLs.
     *
     * The list is built from the site's excluded parameters, the globally configured ones
     * and the campaign tracking parameters; plugins may extend it through an event.
     * All returned names are lowercased.
     *
     * @static
     * @param int $idSite
     * @return array
     */
    public static function getQueryParametersToExclude($idSite)
    {
        $campaignTrackingParameters = Common::getCampaignParameters();

        $campaignTrackingParameters = array_merge(
            $campaignTrackingParameters[0], // campaign name parameters
            $campaignTrackingParameters[1]  // campaign keyword parameters
        );

        $website = Cache::getCacheWebsiteAttributes($idSite);
        $excludedParameters = self::getExcludedParametersFromWebsite($website);

        $parametersToExclude = array_merge(
            $excludedParameters,
            self::getUrlParameterNamesToExcludeFromUrl(),
            $campaignTrackingParameters
        );

        /**
         * Triggered before setting the action url in Piwik\Tracker\Action so plugins can register
         * parameters to be excluded from the tracking URL (e.g. campaign parameters).
         *
         * @param array &$parametersToExclude An array of parameters to exclude from the tracking url.
         */
        Piwik::postEvent('Tracker.PageUrl.getQueryParametersToExclude', array(&$parametersToExclude));

        if (!empty($parametersToExclude)) {
            Common::printDebug('Excluding parameters "' . implode(',', $parametersToExclude) . '" from URL');
        }

        return array_map('strtolower', $parametersToExclude);
    }

    /**
     * Returns the list of URL query parameters that should be removed from the tracked URL query string.
     *
     * The names are read from the comma separated
     * [Tracker] url_query_parameter_to_exclude_from_url config value and trimmed.
     *
     * @return array
     */
    protected static function getUrlParameterNamesToExcludeFromUrl()
    {
        $paramsToExclude = Config::getInstance()->Tracker['url_query_parameter_to_exclude_from_url'];
        $paramsToExclude = explode(",", $paramsToExclude);

        return array_map('trim', $paramsToExclude);
    }

    /**
     * Returns true if URL fragments should be removed for a specific site,
     * false if otherwise.
     *
     * This function uses the Tracker cache and not the MySQL database.
     *
     * @param $idSite int The ID of the site to check for.
     * @return bool
     */
    public static function shouldRemoveURLFragmentFor($idSite)
    {
        $websiteAttributes = Cache::getCacheWebsiteAttributes($idSite);

        return empty($websiteAttributes['keep_url_fragment']);
    }

    /**
     * Cleans and/or removes the URL fragment of a URL.
     *
     * @param $urlFragment string The URL fragment to process.
     * @param $idSite int|bool If not false, this function will check if URL fragments
     *                         should be removed for the site w/ this ID and if so,
     *                         the returned processed fragment will be empty.
     *
     * @return string The processed URL fragment.
     */
    public static function processUrlFragment($urlFragment, $idSite = false)
    {
        // if we should discard the url fragment for this site, return an empty string as
        // the processed url fragment
        if ($idSite !== false
            && self::shouldRemoveURLFragmentFor($idSite)
        ) {
            return '';
        }

        // Remove trailing Hash tag in ?query#hash#
        if (substr($urlFragment, -1) === '#') {
            $urlFragment = substr($urlFragment, 0, -1);
        }

        return $urlFragment;
    }

    /**
     * Will cleanup the hostname (some browsers do not lowercase the hostname),
     * and deal with the hash tag on incoming URLs based on website setting.
     *
     * @param array|bool $parsedUrl Result of parse_url(); empty values are returned untouched.
     * @param $idSite int|bool The site ID of the current visit. This parameter is
     *                         only used by the tracker to see if we should remove
     *                         the URL fragment for this site.
     * @return array
     */
    protected static function cleanupHostAndHashTag($parsedUrl, $idSite = false)
    {
        if (empty($parsedUrl)) {
            return $parsedUrl;
        }

        if (!empty($parsedUrl['host'])) {
            $parsedUrl['host'] = Common::mb_strtolower($parsedUrl['host']);
        }

        if (!empty($parsedUrl['fragment'])) {
            $parsedUrl['fragment'] = self::processUrlFragment($parsedUrl['fragment'], $idSite);
        }

        return $parsedUrl;
    }

    /**
     * Converts Matrix URL format
     * from http://example.org/thing;paramA=1;paramB=6542
     * to   http://example.org/thing?paramA=1&paramB=6542
     *
     * A URL whose first "?" comes before its first ";" is returned unchanged.
     *
     * @param string $originalUrl
     * @return string
     */
    public static function convertMatrixUrl($originalUrl)
    {
        $posFirstSemiColon = strpos($originalUrl, ";");

        if (false === $posFirstSemiColon) {
            return $originalUrl;
        }

        $posQuestionMark = strpos($originalUrl, "?");

        // without any "?" the matrix parameters are the only parameters of the URL
        $replace = (false === $posQuestionMark);

        // a "?" located after the first ";" is turned into a ";" so that it ends up
        // as a regular "&" separator below
        if (false !== $posQuestionMark && $posQuestionMark > $posFirstSemiColon) {
            $originalUrl = substr_replace($originalUrl, ";", $posQuestionMark, 1);
            $replace = true;
        }

        if ($replace) {
            // the first ";" starts the query string, all the others separate parameters
            $originalUrl = substr_replace($originalUrl, "?", $posFirstSemiColon, 1);
            $originalUrl = str_replace(";", "&", $originalUrl);
        }

        return $originalUrl;
    }

    /**
     * Clean up string contents (filter, truncate, ...)
     *
     * Surrounding whitespace is trimmed, line breaks and NUL bytes are removed and the
     * result is cut to the [Tracker] page_maximum_length config value.
     *
     * @param string $string Dirty string
     * @return string
     */
    public static function cleanupString($string)
    {
        $string = trim($string);
        $string = str_replace(array("\n", "\r", "\0"), '', $string);

        $limit = Config::getInstance()->Tracker['page_maximum_length'];

        // NOTE(review): substr() counts bytes, so a multi-byte character sitting on the
        // limit can be cut in half - confirm whether the limit is meant to be in bytes.
        return substr($string, 0, $limit);
    }

    /**
     * Converts a single URL encoded parameter value from the given encoding to UTF-8.
     *
     * Non-string values, and values that are not valid in $encoding, are returned unchanged.
     *
     * @param mixed $value URL encoded parameter value.
     * @param string $encoding Charset the value is assumed to be encoded in. It is taken from
     *                         the tracking request, so it may be any string, including names
     *                         mbstring does not know.
     * @return mixed
     */
    protected static function reencodeParameterValue($value, $encoding)
    {
        if (!is_string($value)) {
            return $value;
        }

        $decoded = urldecode($value);

        try {
            if (function_exists('mb_check_encoding')
                && @mb_check_encoding($decoded, $encoding)) {
                $value = urlencode(mb_convert_encoding($decoded, 'UTF-8', $encoding));
            }
        } catch (\Error $e) {
            // As of PHP 8 the mbstring functions throw a ValueError when the encoding name
            // is unknown, which "@" cannot silence. ValueError does not exist before PHP 8,
            // hence its parent class is caught. The value is kept as it was received.
        }

        return $value;
    }

    /**
     * Recursively applies reencodeParameterValue() to every value of the given array.
     *
     * @param array $queryParameters Name/value mapping, values may be nested arrays.
     * @param string $encoding Charset the values are assumed to be encoded in.
     * @return array
     */
    protected static function reencodeParametersArray($queryParameters, $encoding)
    {
        foreach ($queryParameters as &$value) {
            if (is_array($value)) {
                $value = self::reencodeParametersArray($value, $encoding);
            } else {
                $value = self::reencodeParameterValue($value, $encoding);
            }
        }
        // do not leave a reference to the last element behind
        unset($value);

        return $queryParameters;
    }

    /**
     * Checks if query parameters are of a non-UTF-8 encoding and converts the values
     * from the specified encoding to UTF-8.
     * This method is used to workaround browser/webapp bugs (see #3450). When
     * browsers fail to encode query parameters in UTF-8, the tracker will send the
     * charset of the page viewed and we can sometimes work around invalid data
     * being stored.
     *
     * @param array $queryParameters Name/value mapping of query parameters. Passed by
     *                               reference and updated in place.
     * @param bool|string $encoding of the HTML page the URL is for. Used to workaround
     *                              browser bugs & mis-coded webapps. See #3450.
     *
     * @return array
     */
    public static function reencodeParameters(&$queryParameters, $encoding = false)
    {
        // nothing to convert when no charset was supplied or the page already is UTF-8
        if (empty($encoding) || strtolower($encoding) == 'utf-8') {
            return $queryParameters;
        }

        if (!function_exists('mb_check_encoding')) {
            Common::printDebug("Page charset supplied in tracking request, but mbstring extension is not available.");

            return $queryParameters;
        }

        // if query params are encoded w/ non-utf8 characters (due to browser bug or whatever),
        // encode to UTF-8.
        Common::printDebug("Encoding page URL query parameters to $encoding.");

        $queryParameters = self::reencodeParametersArray($queryParameters, $encoding);

        return $queryParameters;
    }

    /**
     * Prepares a tracked URL for further processing: the input value is unsanitized,
     * cleaned up with cleanupString() and converted from Matrix URL format.
     *
     * @param string $url
     * @return string
     */
    public static function cleanupUrl($url)
    {
        $url = Common::unsanitizeInputValue($url);
        $url = self::cleanupString($url);
        $url = self::convertMatrixUrl($url);

        return $url;
    }

    /**
     * Build the full URL from the prefix ID and the rest.
     *
     * @param string $url URL without its prefix, see normalizeUrl().
     * @param integer $prefixId One of the values of self::$urlPrefixMap. The URL is used
     *                          as is when the ID is null or unknown.
     * @return string
     */
    public static function reconstructNormalizedUrl($url, $prefixId)
    {
        $map = array_flip(self::$urlPrefixMap);

        if ($prefixId !== null && isset($map[$prefixId])) {
            $fullUrl = $map[$prefixId] . $url;
        } else {
            $fullUrl = $url;
        }

        // Clean up host & hash tags, for URLs
        $parsedUrl = @parse_url($fullUrl);
        $parsedUrl = self::cleanupHostAndHashTag($parsedUrl);
        $url = UrlHelper::getParseUrlReverse($parsedUrl);

        if (!empty($url)) {
            return $url;
        }

        // the URL could not be rebuilt from its parts, fall back to the raw concatenation
        return $fullUrl;
    }

    /**
     * Extract the prefix from a URL.
     * Return the prefix ID and the rest.
     *
     * The prefix is matched case insensitively. When no known prefix matches, the URL is
     * returned untouched along with a null prefix ID.
     *
     * @param string $url
     * @return array Array with the keys 'url' and 'prefixId'.
     */
    public static function normalizeUrl($url)
    {
        foreach (self::$urlPrefixMap as $prefix => $id) {
            $prefixLength = strlen($prefix);

            if (strtolower(substr($url, 0, $prefixLength)) === $prefix) {
                return array(
                    'url'      => substr($url, $prefixLength),
                    'prefixId' => $id
                );
            }
        }

        return array('url' => $url, 'prefixId' => null);
    }

    /**
     * Returns the cleaned up URL if it looks like a URL, false otherwise.
     *
     * @param string $url
     * @return bool|string
     */
    public static function getUrlIfLookValid($url)
    {
        $url = self::cleanupString($url);

        if (!UrlHelper::isLookLikeUrl($url)) {
            Common::printDebug("WARNING: URL looks invalid and is discarded");

            return false;
        }

        return $url;
    }

    /**
     * Returns the 'excluded_parameters' entry of the given website attributes,
     * or an empty array when there is none.
     *
     * @param array $website
     * @return array
     */
    private static function getExcludedParametersFromWebsite($website)
    {
        if (isset($website['excluded_parameters'])) {
            return $website['excluded_parameters'];
        }

        return array();
    }

    /**
     * URL decodes the given value, unless the decoded value is not valid UTF-8.
     * In that case the decoded value is URL encoded again and returned in that form.
     *
     * When the mbstring extension is not available the decoded value is returned unchecked.
     *
     * @param string $value
     * @return string
     */
    public static function urldecodeValidUtf8($value)
    {
        $value = urldecode($value);

        if (function_exists('mb_check_encoding')
            && !@mb_check_encoding($value, 'utf-8')
        ) {
            return urlencode($value);
        }

        return $value;
    }
}
| 385 |