projects:digikey_partsdb
== grabbing FV's ==
We need the FV's to crawl each subsection. Grab all the above URLs and make sure Results per Page = 500; the CSV download is capped at 500 results per fetch, so there's no point increasing this value.
* <input type=hidden name=FV value=fff40000,
* <a class="
The page/8 is the total page count; pages start from 1.
Grab the FV value and page count, and store them for each of the above URLs.
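Here's a rough sketch of that extraction; the marker strings are assumptions based on the snippets above, so adjust them to the live markup. It scans a saved results page for the FV hidden field and the largest page/N link.

<code>
/* grabfv - a sketch, not the exact crawler: pulls the FV value and the
   page count out of a saved results page. The marker strings are
   assumptions based on the snippets above. */
#include <stdio.h>
#include <string.h>

int main( int argc, char **argv )
{
    char line[4096];
    char fv[256];
    int pages = 0;

    if( argc < 2 ) {
        fprintf(stderr,"usage: grabfv results.html\n");
        return -1;
    }

    FILE *fp = fopen( argv[1], "rb" );
    if( fp == NULL ) {
        fprintf(stderr,"can't open %s\n", argv[1]);
        return -2;
    }

    fv[0] = 0;

    while( fgets( line, sizeof(line), fp ) != NULL ) {
        char *p;

        // hidden form field, e.g. <input type=hidden name=FV value=fff40000,...>
        if( ( p = strstr( line, "name=FV value=" ) ) != NULL )
            sscanf( p + 14, "%255[^> ]", fv );

        // page links end in page/N; the largest N is the total page count
        if( ( p = strstr( line, "page/" ) ) != NULL ) {
            int n;
            if( sscanf( p + 5, "%d", &n ) == 1 && n > pages )
                pages = n;
        }
    }

    fclose( fp );

    printf( "FV=%s pages=%d\n", fv, pages );

    return 0;
}
</code>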
== crawl individual pages ==

Use curl with a valid user agent; I used --useragent "

<code>
curl.exe -o page%1.csv -L -v -G "
</code>
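If you'd rather drive the fetch from code than a batch file, here's a minimal libcurl sketch of the same request. The URL is a placeholder (substitute the real download URL with its FV and page parameters), and the user agent string is just an example.

<code>
#include <stdio.h>
#include <curl/curl.h>

int main( void )
{
    // placeholder - put the real CSV download URL here
    const char *url = "http://www.digikey.com/...";

    CURL *curl;
    CURLcode res;
    FILE *out;

    curl_global_init( CURL_GLOBAL_DEFAULT );

    curl = curl_easy_init();
    if( curl == NULL )
        return -1;

    out = fopen( "page1.csv", "wb" );
    if( out == NULL )
        return -2;

    curl_easy_setopt( curl, CURLOPT_URL, url );
    curl_easy_setopt( curl, CURLOPT_USERAGENT, "Mozilla/5.0" ); // any valid UA
    curl_easy_setopt( curl, CURLOPT_FOLLOWLOCATION, 1L );       // same as -L
    curl_easy_setopt( curl, CURLOPT_WRITEDATA, out );           // default callback writes to this FILE*

    res = curl_easy_perform( curl );
    if( res != CURLE_OK )
        fprintf( stderr, "curl: %s\n", curl_easy_strerror( res ) );

    fclose( out );
    curl_easy_cleanup( curl );
    curl_global_cleanup();

    return res == CURLE_OK ? 0 : -3;
}
</code>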
The response has 4 bytes at the front that we don't want, so a simple byteskip script or piece of code takes care of it.
<code>
#include <stdio.h>
#include <stdlib.h>

int main( int argc, char **argv )
{
    FILE *fp, *ofp;

    if( argc < 4 ) {
        fprintf(stderr,"usage: byteskip infile outfile offset\n");
        exit(-1);
    }

    fp = fopen( argv[1], "rb" );
    if( fp == NULL ) {
        fprintf(stderr,"can't open input file %s\n", argv[1]);
        exit(-2);
    }

    unsigned long length;

    // find the input file length
    fseek( fp, 0, SEEK_END );

    length = ftell( fp );

    if( length == 0 ) {

        fclose( fp );

        fprintf(stderr,"input file %s is empty\n", argv[1]);
        exit(-3);
    }

    unsigned long offset;

    //skip offset
    offset = strtoul (argv[3], NULL, 0);

    if( offset >= length ){

        fclose( fp );

        fprintf(stderr,"offset %lu is past the end of the file\n", offset);
        exit(-5);
    }

    // set to skip position
    fseek( fp, offset, SEEK_SET );

    unsigned char *buffer = NULL;

    buffer = (unsigned char *)malloc( length - offset );
    if( buffer == NULL ) {

        fclose(fp);

        fprintf(stderr,"can't allocate %lu bytes\n", length - offset);
        exit(-6);
    }

    // read whole buffer.
    if( fread(buffer, 1, length - offset, fp) != length - offset ) {
        fclose(fp);
        free( buffer );
        fprintf(stderr,"read error on %s\n", argv[1]);
        exit(-7);

    }

    // open output file for writing.
    ofp = fopen( argv[2], "wb" );

    if( ofp == NULL ) {
        fclose(fp);

        free( buffer );
        buffer = NULL;
        fprintf(stderr,"can't open output file %s\n", argv[2]);
        exit(-8);
    }

    if( fwrite(buffer, 1, length - offset, ofp) != length - offset ) {
        fclose(fp);
        fclose(ofp);
        free( buffer );
        fprintf(stderr,"write error on %s\n", argv[2]);
        exit(-9);
    }

    free( buffer );

    fclose(fp);
    fclose(ofp);

    return 0;
}
</code>

Process all the files.

<code>
for %a in (*.csv) do byteskip %a o%a 4
</code>
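That writes each cleaned file next to the original with an o prefix, so page1.csv becomes opage1.csv. If you put the loop in a batch file rather than typing it at the prompt, double the percent signs: %%a.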

I used one of the online CSV to MySQL converters, but most of them can't handle the variations in CSV. To create the initial schema for each table, I converted one CSV to XLS by importing it into Google Docs and re-exporting it as an XLS, then imported that into phpMyAdmin; that makes the base schema.
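Once the base schema exists, MySQL's LOAD DATA LOCAL INFILE can also bulk-load the cleaned CSVs directly, though you may still have to tweak the field and line terminators to match the quoting in the downloads.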

Rename the table in phpMyAdmin or via the mysql command-line tool.

Then do the final import with the csvtosql tool (in progress).