initial commit

v0.1
rrr-marble 4 years ago
commit 92e122297f

2
.gitignore vendored

@ -0,0 +1,2 @@
.venv/
src/__pycache__/

@ -0,0 +1,15 @@
# Requirements
### Minimal versions
python 3.7
postgresql 11
### Database schema
includes a generated TSVECTOR search column with GIN index `geodata_search_idx`
```sql
ALTER TABLE geodata ADD COLUMN geodata_search_ts TSVECTOR
GENERATED ALWAYS AS (to_tsvector('russian', geodata.description)) STORED
```
(use coalesce() if multiple columns are combined)
```sql
CREATE INDEX geodata_search_idx ON geodata USING GIN (geodata_search_ts);
```

@ -0,0 +1,15 @@
{
"folders": [
{
"path": "."
}
],
"settings": {
"python.defaultInterpreterPath": ".venv/bin/python",
"python.formatting.blackPath": "/usr/bin/black",
"python.linting.flake8Path": "/usr/bin/flake8",
"python.linting.flake8Args": [
"--black-config=''"
]
}
}

@ -0,0 +1,18 @@
anyio==3.5.0
asgiref==3.5.0
click==8.0.4
et-xmlfile==1.1.0
fastapi==0.74.1
greenlet==1.1.2
h11==0.13.0
idna==3.3
openpyxl==3.0.9
psycopg2==2.9.3
pydantic==1.9.0
python-multipart==0.0.5
six==1.16.0
sniffio==1.2.0
SQLAlchemy==1.4.31
starlette==0.17.1
typing_extensions==4.1.1
uvicorn==0.17.5

@ -0,0 +1,14 @@
anyio==3.5.0
et-xmlfile==1.1.0
fastapi==0.74.1
greenlet==1.1.2
idna==3.3
openpyxl==3.0.9
psycopg2==2.9.3
pydantic==1.9.0
python-multipart==0.0.5
six==1.16.0
sniffio==1.2.0
SQLAlchemy==1.4.31
starlette==0.17.1
typing_extensions==4.1.1

@ -0,0 +1,62 @@
from typing import List
from sqlalchemy.orm import Session
from sqlalchemy.sql.expression import func
from sqlalchemy import inspect
from . import models, schemas
from .database import Base
def get_item(db: Session, item_id: int):
    """Individual page for a single dataset description: fetch one Item by primary key."""
    return db.query(models.Item).filter_by(id=item_id).first()
def get_item_by_description(db: Session, needle: str, skip: int = 0, limit: int = 20):
    """Full-text search over dataset descriptions.

    The additional query syntax accepted by websearch_to_tsquery maps to:
    - plain text: the given words in any order
    - "quoted text": the given words in the given order
    - OR: any of the given words
    - -word: results must not contain the word

    Args:
        db: active SQLAlchemy session.
        needle: the search query string.
        skip: pagination offset.
        limit: maximum number of rows returned.

    Returns:
        List of models.Item matching the query, ordered by id.
    """
    # hardcode russian for now
    # built-in postgres websearch_to_tsquery() is good
    # https://www.postgresql.org/docs/11/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
    # TODO: add requested search columns [0]
    items_table = inspect(models.Item).local_table
    result = (
        db.query(models.Item)
        .filter(
            # BUGFIX: the mapped table has no column `ts`; the generated
            # full-text column is `geodata_search_ts` (see models.Item and
            # the README schema snippet)
            items_table.c.geodata_search_ts.op("@@")(
                func.websearch_to_tsquery('"russian"', needle)
            )
        )
        .order_by(items_table.c.id)
        .offset(skip)
        .limit(limit)
        .all()
    )
    return result
def get_items(db: Session, skip: int = 0, limit: int = 20):
    """List the dataset descriptions available in the system (paginated)."""
    query = db.query(models.Item)
    return query.offset(skip).limit(limit).all()
def insert_items(db: Session, items: List[schemas.ItemCreate]):
    """Bulk-import dataset descriptions into the database.

    Returns a (accepted, inserted) pair: how many rows the caller handed
    in, and how many actually landed in the table (difference of the row
    counts taken before and after the commit).
    """
    count_before = db.query(models.Item).count()
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-fast-execution-helpers
    db.add_all(models.ItemCreate(**item.dict()) for item in items)
    db.commit()
    count_after = db.query(models.Item).count()
    return (len(items), count_after - count_before)
def get_headers(db: Session):
    """Full table header captions."""
    return list(db.query(models.Header))

@ -0,0 +1,18 @@
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# Database connection parameters
# NOTE(review): the constant name is misspelled ("SQALCHEMY"); left as-is in
# case other modules import it by this name — TODO confirm before renaming.
# WARNING: credentials are hardcoded in the URL; consider loading PASSWORD
# from the environment instead of committing it to the repository.
SQALCHEMY_DATABASE_URL = "postgresql+psycopg2://geodata:PASSWORD@postgres/geodata"
# Engine tuned for bulk inserts via the psycopg2 fast-execution helpers
engine = create_engine(
    SQALCHEMY_DATABASE_URL,
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-connect-arguments
    executemany_mode="values_only",
    executemany_values_page_size=10000,
    executemany_batch_page_size=500,
)
# Session factory; handed out per-request by the FastAPI dependency
SessionLocal = sessionmaker(autocommit=False, bind=engine)
# Declarative base shared by all ORM models
Base = declarative_base()

@ -0,0 +1,139 @@
from datetime import datetime
from re import IGNORECASE, sub as substitute
from typing import List
from urllib.parse import unquote_plus
from fastapi import Depends, FastAPI, File, HTTPException, UploadFile
from sqlalchemy.orm import Session
from . import crud, models, schemas, spreadsheet
from .database import SessionLocal, engine
# TODO: switch for Alembic [2]
models.Base.metadata.create_all(bind=engine)
app = FastAPI()
# Dependency
def get_db():
    """FastAPI dependency: yield one database session per request, always closing it."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
@app.post(
    "/items/",
    responses={
        422: {
            "description": "Загружен некорректный файл (ожидался .xlsx). "
            "Неизвестный заголовок. Или обнаружены данные в столбце без заголовка. "
            "Необходимо использовать файлы установленного образца."
        },
    },
    response_model=schemas.InsertStatus,
)
def create_items(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Import dataset descriptions into the database from an Excel file upload."""
    try:
        start = datetime.now()
        # parse spreadsheet into a collection of header and data iterators
        # TODO: Multiple worksheets per Workbook or one? [3]
        # assume one for now (the second one has strange formatting)
        with spreadsheet.parse(file=file) as spreadsheet_parse:
            sheet = spreadsheet_parse[0]
            # TODO: use a full-fledged validation framework? [6]
            # ad-hoc for now:
            # fetch known headers from the database
            # NB: we never check the actual order of the columns, so we might suffer for it
            # we have to construct a collection of known headers anyway,
            # might as well make it a set
            unknown_headers = set(sheet["header"]) - set(
                dbh.spreadsheet for dbh in crud.get_headers(db=db)
            )
            if unknown_headers:
                raise HTTPException(
                    status_code=422,
                    detail="Unknown headers in the spreadsheet: {}. Check the coordinated format".format(
                        unknown_headers
                    ),
                )
            # construct a list of schemas.ItemCreate items
            # NOTE(review): assumes the spreadsheet column order matches the
            # declaration order of ItemCreate fields — TODO confirm
            spreadsheet_item_list = [
                schemas.ItemCreate(
                    **{
                        key: row[i]
                        for i, key in enumerate(schemas.ItemCreate.__fields__.keys())
                    }
                )
                for row in sheet["data"]
            ]
            # dump all the data into database
            accepted, processed = crud.insert_items(db=db, items=spreadsheet_item_list)
    except spreadsheet.InvalidFileException:
        raise HTTPException(
            status_code=422, detail="Invalid file upload (expected .xlsx)"
        )
    except spreadsheet.DataInUnnamedColumnException:
        raise HTTPException(
            status_code=422, detail="Data is found in a column with empty header"
        )
    return schemas.InsertStatus(
        status="Success" if accepted == processed else "Warning",
        accepted=accepted,
        processed=processed,
        process_time=datetime.now() - start,
    )
@app.get(
    "/item/{item_id}",
    response_model=schemas.Item,
    responses={404: {"description": "Такой набор данных отсутствует"}},
)
def read_item(item_id: int, db: Session = Depends(get_db)):
    """Individual page for a single dataset description."""
    item = crud.get_item(db=db, item_id=item_id)
    if item is None:
        raise HTTPException(status_code=404, detail="Item not found")
    return item
@app.get("/items/", response_model=List[schemas.Item])
def read_items(skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """List the dataset descriptions available in the system (paginated)."""
    items = crud.get_items(db=db, skip=skip, limit=limit)
    return items
@app.get(
    "/search/",
    response_model=List[schemas.Item],
    responses={400: {"description": "Запрос слишком короткий (минимум 3 символа)"}},
)
def search(q: str, skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """Full-text search over dataset descriptions.

    The additional query syntax is translated into a search query matching:
    - plain text: the given words in any order
    - "quoted text": the given words in the given order
    - OR / ИЛИ: any of the given words
    - -word: results must not contain the word

    Raises:
        HTTPException(400): the (rewritten) query is shorter than 3 characters.
    """
    q = unquote_plus(q)
    # replace every standalone ИЛИ word (case-insensitive) with OR
    q = substitute(r"\bИЛИ\b", "OR", q, flags=IGNORECASE)
    # validate after the rewrite so an "ИЛИ"-only query is rejected too
    if len(q) < 3:
        raise HTTPException(status_code=400, detail="Query too short")
    return crud.get_item_by_description(db=db, needle=q, skip=skip, limit=limit)
@app.get("/headers/", response_model=List[schemas.Header])
def read_headers(db: Session = Depends(get_db)):
    """Full table column captions."""
    headers = crud.get_headers(db=db)
    return headers

@ -0,0 +1,50 @@
from sqlalchemy import Column, Integer, String, DateTime
# we might need vector concat later, then we'll have to bring in sqlalchemy_utils
# https://sqlalchemy-utils.readthedocs.io/en/latest/_modules/sqlalchemy_utils/types/ts_vector.html
from sqlalchemy.dialects.postgresql import TSVECTOR
from .database import Base
class Header(Base):
    """Mapping between a database column name and its spreadsheet caption."""

    __tablename__ = "headers"
    # database-side column name; doubles as the primary key
    database = Column(String, primary_key=True)
    # caption as it appears in the spreadsheet header row
    spreadsheet = Column(String)
class ItemBase(Base):
    """ORM model for a geological dataset description (table `geodata`).

    All descriptive attributes are free-form strings imported from the
    spreadsheet; see schemas.ItemBase for the matching pydantic model.
    """

    __tablename__ = "geodata"
    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): attribute names (including spellings such as
    # `formdimentions`) mirror the existing DB schema — do not "fix" them
    fadr = Column(String)
    oldid = Column(String)
    category = Column(String)
    basin = Column(String)
    deposit = Column(String)
    well = Column(String)
    depth = Column(String)
    stratum = Column(String)
    owner = Column(String)
    org = Column(String)
    ownercontacts = Column(String)
    samplelist = Column(String)
    description = Column(String)
    formdimentions = Column(String)
    datalist = Column(String)
    resolution = Column(String)
    date = Column(String)
    additionalinfo = Column(String)
    scanner = Column(String)
    comment = Column(String)
    continuation = Column(String)
class ItemCreate(ItemBase):
    # same mapped table as ItemBase; unlike Item it does not declare the
    # generated geodata_search_ts column (presumably so inserts never try
    # to write it — see crud.insert_items; TODO confirm)
    pass
class Item(ItemBase):
    """Read-side model: adds the database-generated full-text search vector."""

    # populated by the database (README: GENERATED ALWAYS AS to_tsvector(...))
    geodata_search_ts = Column(TSVECTOR)

@ -0,0 +1,69 @@
from datetime import datetime, timedelta
from typing import Optional, Union
from pydantic import BaseModel
class HeaderBase(BaseModel):
    """Shared fields of a table-header description."""

    # database-side column name
    database: str
    # spreadsheet caption for the same column
    spreadsheet: str
class HeaderCreate(HeaderBase):
    """Creation payload for a header entry (no extra fields)."""

    pass
class Header(HeaderBase):
    """Header of a geological dataset description
    produced from expedition survey results."""

    class Config:
        orm_mode = True
class ItemBase(BaseModel):
    """Description of a geological dataset
    produced from expedition survey results.

    Every field is optional; values arrive as free-form strings from the
    spreadsheet importer (``date`` may also arrive as a datetime).
    """

    fadr: Optional[str] = None
    oldid: Optional[str] = None
    category: Optional[str] = None
    basin: Optional[str] = None
    deposit: Optional[str] = None
    well: Optional[str] = None
    depth: Optional[str] = None
    stratum: Optional[str] = None
    owner: Optional[str] = None
    org: Optional[str] = None
    ownercontacts: Optional[str] = None
    samplelist: Optional[str] = None
    description: Optional[str] = None
    formdimentions: Optional[str] = None
    datalist: Optional[str] = None
    resolution: Optional[str] = None
    date: Optional[Union[datetime, str]] = None
    additionalinfo: Optional[str] = None
    scanner: Optional[str] = None
    comment: Optional[str] = None
    continuation: Optional[str] = None
class ItemCreate(ItemBase):
    """Import payload for an item: same fields as ItemBase, no id."""

    pass
class Item(ItemBase):
    """Read schema: adds the database-assigned primary key."""

    id: int

    class Config:
        orm_mode = True
class InsertStatus(BaseModel):
    """Basic insertion status metrics for a bulk import."""

    # "Success" when every accepted row was inserted, else "Warning"
    status: str
    # number of rows parsed from the upload
    accepted: int
    # number of rows actually inserted
    processed: int
    # wall-clock duration of the import
    process_time: timedelta

@ -0,0 +1,103 @@
from openpyxl import Workbook, load_workbook
from openpyxl.utils.exceptions import InvalidFileException
from fastapi import File, UploadFile
from asyncio import run as asyncio_run
from contextlib import closing, contextmanager
from itertools import chain
from os import remove
from pathlib import Path
from shutil import copyfileobj
from tempfile import NamedTemporaryFile
class DataInUnnamedColumnException(Exception):
    """All the columns containing any data have to be named."""

    pass
@contextmanager
def parse(file: UploadFile = File(...)):
    """Parse an uploaded .xlsx file into header/data iterators.

    Yields a list with one dict per parsed worksheet:
    [
        {"header": header_iterator, "data": data_iterator},  # sheet1
        {"header": header_iterator, "data": data_iterator},  # sheet2
        # etc
    ]

    Raises:
        InvalidFileException: the upload is not a readable .xlsx file.
        DataInUnnamedColumnException: data found in a column with no header.
    """
    # prepare return list
    result = []
    suffix = Path(file.filename).suffix
    # sentinel so the finally block knows whether the upload was ever
    # spooled to disk (the original raised NameError on early failure)
    spreadsheet_file = None
    try:
        # TODO: decide if we use subdir in /tmp here [5]
        # create the working directory up front: NamedTemporaryFile raises
        # FileNotFoundError if dir= does not exist
        Path("/tmp/notgeo").mkdir(parents=True, exist_ok=True)
        with NamedTemporaryFile(delete=False, suffix=suffix, dir="/tmp/notgeo") as tmp:
            copyfileobj(file.file, tmp)
        spreadsheet_file = Path(tmp.name)
        # Unlike a normal workbook, a read-only workbook uses lazy loading
        # and must be explicitly closed with close();
        # contextlib.closing() does it for us
        with closing(load_workbook(spreadsheet_file, read_only=True)) as wb:
            # TODO: Multiple worksheets per Workbook or one? [3]
            # assume one for now (the second one has strange formatting)
            ws = wb.active
            # assume headers are in the top row
            header = [
                cell
                for row in ws.iter_rows(max_row=1, values_only=True)
                for cell in row
            ]
            # assume the data stretch is continuous: find the first None in
            # the header and assert that no unnamed column contains data.
            # .index is 0-based and min_col= is 1-based, so the first
            # unnamed column is min_col=header.index(None) + 1
            try:
                last_column_with_header = header.index(None)
                # unpack the row generators of the unnamed columns, chain
                # them together, and make sure no cell contains any data
                if any(
                    chain(
                        *(
                            ws.iter_rows(
                                min_col=last_column_with_header + 1, values_only=True
                            )
                        )
                    )
                ):
                    raise DataInUnnamedColumnException(
                        "Data is found in a column with empty header"
                    )
            except ValueError:
                # header.index(None) found nothing: every cell in the first
                # row is filled, so all columns have headers — safe to go on
                last_column_with_header = len(header)
            # Construct the spreadsheet data iterator.
            # max_col= is 1-based AND inclusive, so the last named column is
            # exactly max_col=last_column_with_header
            data = ws.iter_rows(
                min_row=2, max_col=last_column_with_header, values_only=True
            )
            result.append(
                {"header": (cell for cell in header if cell is not None), "data": data}
            )
            # END [3]
            yield result
    finally:
        # clean up: close the upload and remove the temporary spreadsheet
        # copy (if it was ever created); tmp is already closed by its
        # context manager, so no extra close() is needed
        asyncio_run(file.close())
        if spreadsheet_file is not None:
            remove(spreadsheet_file)
Loading…
Cancel
Save